Source code for querexfuzz.core

import logging
from pathlib import Path
from types import MethodType
import yaml

import pandas as pd

from .parser import parser
from .engine import execute_query
from .config import QuerexfuzzConfig, FuzzyConfig


logger = logging.getLogger(__name__)

# Default attribute name under which the query method is attached to a
# DataFrame. This is a module-level constant (not a class attribute); it is
# used by Querexfuzz.attach_to when no explicit method_name is supplied.
_METHOD_NAME = "querex"



[docs]
class Querexfuzz:
    """Manages configuration and attachment of the .querex method."""

    def __init__(self, *, config_path: str | Path | None = None, **kwargs):
        """
        Initializes the Querexfuzz engine with a flexible configuration.

        The configuration can be loaded from a YAML file, provided directly as
        keyword arguments, or both (with keyword arguments overriding the file's
        settings).

        Overrides are applied at the top level only: a keyword argument
        replaces the whole value stored under the same key in the file (for
        example, passing ``fuzzy=...`` replaces the file's entire ``fuzzy``
        section rather than merging into it).

        Args:
            config_path (str | Path | None, optional): The path to a YAML
                configuration file. Defaults to None. An empty file is
                treated as an empty configuration.
            **kwargs: Keyword arguments that correspond to the fields in the
                QuerexfuzzConfig model. These will override any values loaded
                from the config_path.

        Raises:
            TypeError: If the YAML file's top-level content is not a mapping.

        Examples:
            >>> # 1. From a file only
            >>> qf = Querexfuzz(config_path='config.yml')

            >>> # 2. From keyword arguments only
            >>> qf = Querexfuzz(base_cols=['name', 'age'], recent_field='mod')

            >>> # 3. From a file with specific overrides
            >>> qf = Querexfuzz(config_path='config.yml', fuzzy={'limit': 200})
        """
        # Always define the attribute so it is readable even when no file
        # was supplied.
        self.config_path: Path | None = None
        config_data: dict = {}

        if config_path:
            self.config_path = Path(config_path)
            with self.config_path.open("r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)

            # safe_load returns None for an empty document; treat that as
            # "no settings" instead of failing on the merge below.
            if loaded is None:
                loaded = {}
            elif not isinstance(loaded, dict):
                raise TypeError(
                    f"Configuration file {self.config_path} must contain a "
                    f"mapping at the top level, got {type(loaded).__name__}."
                )
            config_data = loaded

        # Keyword arguments override the data loaded from the file
        # (in-place, shallow update; same as config_data.update(kwargs)).
        config_data |= kwargs

        # Validate and create the final config object using Pydantic
        self.config = QuerexfuzzConfig(**config_data)

        # Keyed by id(df): (search_cols, matcher). Populated lazily on first fuzzy call.
        # Mutable dfs (opt-in via attach_to) bypass the cache and rebuild every call.
        self._fuzzy_cache: dict = {}
        self._mutable_ids: set = set()

        logger.info("Querexfuzz engine initialized successfully.")


[docs]
    @staticmethod
    def parse(expr):
        """Run the query parser on ``expr`` and return whatever it produces.

        A thin convenience wrapper, mainly useful when testing the grammar;
        the underlying ``parser`` callable can equally be imported directly.
        """
        parsed = parser(expr)
        return parsed


    def _query_method(self, df: pd.DataFrame, expr: str) -> pd.DataFrame:
        """Parse ``expr`` and execute the resulting query against ``df``.

        This is the callable that ``attach_to`` binds onto a DataFrame. The
        parsed spec is logged at debug level, then handed to
        ``execute_query`` together with this engine's config and the engine
        itself.
        """
        parsed_spec = parser(expr)
        logger.debug("Parsed query spec: %s", parsed_spec)
        result = execute_query(df, parsed_spec, self.config, engine=self)
        return result


[docs]
    def attach_to(
        self,
        df: pd.DataFrame,
        method_name: str | None = None,
        alias: str | None = "q",
        mutable: bool = False,
    ) -> pd.DataFrame:
        """
        Attaches the query method to a DataFrame instance.

        The method is stored as a plain instance attribute. If ``df`` has a
        column with the same name as ``method_name`` or ``alias``, the column
        data is left untouched: the attribute shadows dotted access
        (``df.name``), while ``df["name"]`` still returns the column. A
        warning is logged in that case.

        Args:
            df (pd.DataFrame): The DataFrame to modify.
            method_name (str | None, optional): Attribute name for the query
                method. If None or '', the module default (``'querex'``) is
                used. Defaults to None.
            alias (str | None, optional): A short alias for the query method.
                Set to 'q' by default. If None or '', no alias is created.
                Defaults to 'q'.
            mutable (bool): If True, the fuzzy matcher is rebuilt on every call
                instead of being cached. Use when the DataFrame's contents change
                between queries. Defaults to False.

        Returns:
            pd.DataFrame: The same DataFrame, now with the query method attached.
        """
        method_name = method_name or _METHOD_NAME
        df_id = id(df)
        logger.debug(
            "Attaching .%s method to DataFrame with id: %d", method_name, df_id
        )

        def _bind(name: str) -> None:
            """Store the bound query method on ``df`` under ``name``."""
            if name in df.columns:
                logger.warning(
                    "Attribute '.%s' shadows a column of the same name; "
                    "use df[%r] to access the column.",
                    name,
                    name,
                )
            # Bypass DataFrame.__setattr__: when ``name`` matches a column,
            # pandas would turn the assignment into ``df[name] = value`` and
            # overwrite the column's data with the method object.
            object.__setattr__(df, name, MethodType(self._query_method, df))

        _bind(method_name)

        if alias:
            logger.debug("Adding alias '.%s' for the query method.", alias)
            _bind(alias)

        # The cache is keyed by id(df), and ids can be reused once an object
        # is garbage collected. Drop any entry for this id so a freshly
        # attached frame never picks up a matcher built for another one; the
        # entry is rebuilt lazily on the next fuzzy call.
        self._fuzzy_cache.pop(df_id, None)

        if mutable:
            self._mutable_ids.add(df_id)
        else:
            self._mutable_ids.discard(df_id)

        return df




# helper factory method

[docs]
def querexfuzz_from_df(
    df,
    *,
    base_cols=None,
    bang_field=None,
    default_date_field=None,
    recent_field=None,
    fuzzy_fields=None,
    score_col_name="score",
    attach=True,
    fuzzy_limit=50,
):
    """
    Create a Querexfuzz by inspecting df.

    By default, all columns not starting _ are selected to base_cols.
    All date fields to date_fields. If only one, it becomes the
    default_date_field (default for @ clauses) and the recent_field
    (field for sorting for recent). If only one of default_date_field and
    recent_field ends up set, the other one takes the same value.

    If fuzzy_fields is None (or empty) then all object-dtype columns not
    starting _ are selected. If there is exactly one fuzzy field, it becomes
    the bang field (default for regex) when bang_field is omitted.

    Highlight mode is true if len(fuzzy_fields)==1.

    Column labels that are not strings (e.g. integers) are never treated as
    private, so they are always eligible for selection.

    Parameters
    ----------
    df : DataFrame to inspect (and, if attach is True, to attach to).
    base_cols : columns returned by default; inferred from df if omitted.
    bang_field : default field for regex filters; inferred if omitted and
        there is exactly one fuzzy field.
    default_date_field : default field for @ clauses; inferred as above.
    recent_field : field used for sorting by recent; inferred as above.
    fuzzy_fields : a single column name or a list of column names used for
        fuzzy search; inferred from object-dtype columns if omitted.
    score_col_name : name for the score column in fuzzy matches.
    attach : attach querexfuzz method to df in place (default True).
    fuzzy_limit : value passed as ``limit`` to the fuzzy configuration
        (default 50).

    Returns
    -------
    The newly created Querexfuzz instance.
    """

    def _is_public(label) -> bool:
        # Only string labels can carry the "_" privacy prefix. Checking the
        # type first avoids indexing errors on ints or empty strings.
        return not (isinstance(label, str) and label.startswith("_"))

    base_cols = base_cols or [col for col in df.columns if _is_public(col)]

    date_fields = [
        col
        for col in df.select_dtypes(
            include=["datetime64[ns]", "datetimetz"]
        ).columns
        if _is_public(col)
    ]
    single_date_field = date_fields[0] if len(date_fields) == 1 else None

    if default_date_field is None and single_date_field is not None:
        default_date_field = single_date_field
    if recent_field is None and single_date_field is not None:
        recent_field = single_date_field
    elif recent_field is None and default_date_field is not None:
        recent_field = default_date_field
    elif recent_field is not None and default_date_field is None:
        default_date_field = recent_field

    # Accept a bare column name as shorthand for a one-element list.
    if isinstance(fuzzy_fields, str):
        fuzzy_fields = [fuzzy_fields]
    fuzzy_fields = fuzzy_fields or [
        col
        for col in df.select_dtypes(include="object").columns
        if _is_public(col)
    ]

    single_fuzzy_field = len(fuzzy_fields) == 1
    if bang_field is None and single_fuzzy_field:
        bang_field = fuzzy_fields[0]

    qfz = Querexfuzz(
        base_cols=base_cols,
        date_fields=date_fields,
        default_date_field=default_date_field,
        bang_field=bang_field,
        recent_field=recent_field,
        fuzzy=FuzzyConfig(
            fields=fuzzy_fields,
            limit=fuzzy_limit,
            score_col_name=score_col_name,
            highlight=single_fuzzy_field,
        ),
    )
    if attach:
        # this acts in-place
        qfz.attach_to(df)
    return qfz




[docs]
def querexfuzz_help() -> str:
    """Return the plain-text cheat sheet for the querexfuzz query grammar.

    The text lists each supported clause, in the order the clauses must
    appear in a query, alongside a short description.
    """
    help_text = """
querexfuzz  Help
================

Query syntax (all clauses optional, must appear in this order)
--------------------------------------------------------------
An empty query returns all base columns for all rows.

verbose                              print query details
recent                               sort by most recent date field
top n | bottom n                     limit to n rows from head or tail
select col1[, col2]                  named columns
select * | **                        * = base cols (default), ** = all cols
select *, -col | **, -col            base/all cols minus col
! regex | field ~ regex              regex filter on bang_field or named field
where sql_expression                 where city == 'Berlin' and age > 30
order|sort by [-]col[, cols]         - prefix for descending order
@[field] unit[-from[:to]]            date range filter
                                     e.g. @m-3, @created_date y-2:1
                                     units: c=calendar year, y, q, m, w, d, h
# fuzzy_term                         fuzzy search (must be last clause)
"""
    return help_text