Source code for querexfuzz.core

import logging
from pathlib import Path
from types import MethodType
import yaml

import pandas as pd

from .parser import parser
from .engine import execute_query
from .config import QuerexfuzzConfig, FuzzyConfig


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default attribute name under which the query method is attached to a
# DataFrame. This is a module-level constant (not a class attribute); it is
# used by Querexfuzz.attach_to when no explicit method_name is supplied.
_METHOD_NAME = "querex"


class Querexfuzz:
    """Manages configuration and attachment of the .querex method."""

    def __init__(self, *, config_path: str | Path | None = None, **kwargs):
        """
        Initializes the Querexfuzz engine with a flexible configuration.

        The configuration can be loaded from a YAML file, provided directly
        as keyword arguments, or both (with keyword arguments overriding the
        file's settings).

        Args:
            config_path (str | Path | None, optional): The path to a YAML
                configuration file. Defaults to None.
            **kwargs: Keyword arguments that correspond to the fields in the
                QuerexfuzzConfig model. These will override any values
                loaded from the config_path.

        Raises:
            TypeError: If the YAML file's top-level value is not a mapping.

        Examples:
            >>> # 1. From a file only
            >>> qf = Querexfuzz(config_path='config.yml')

            >>> # 2. From keyword arguments only
            >>> qf = Querexfuzz(base_cols=['name', 'age'], recent_field='mod')

            >>> # 3. From a file with specific overrides
            >>> qf = Querexfuzz(config_path='config.yml', fuzzy={'limit': 200})
        """
        config_data = {}
        # Always define the attribute so it can be read even when the engine
        # was configured purely from keyword arguments.
        self.config_path = Path(config_path) if config_path else None

        if self.config_path is not None:
            with self.config_path.open("r", encoding="utf-8") as f:
                # safe_load returns None for an empty document; treat that
                # as "no settings" instead of failing on the merge below.
                config_data = yaml.safe_load(f) or {}
            if not isinstance(config_data, dict):
                raise TypeError(
                    f"Configuration file {self.config_path} must contain a "
                    f"mapping at the top level, got "
                    f"{type(config_data).__name__}."
                )

        # Keyword arguments override the data loaded from the file
        # (in-place update, same as config_data.update(kwargs)).
        config_data |= kwargs

        # Validate and create the final config object using Pydantic
        self.config = QuerexfuzzConfig(**config_data)

        # Keyed by id(df): (search_cols, matcher). Populated lazily on first
        # fuzzy call. Mutable dfs (opt-in via attach_to) bypass the cache and
        # rebuild every call.
        self._fuzzy_cache: dict = {}
        self._mutable_ids: set = set()

        logger.info("Querexfuzz engine initialized successfully.")
        # logger.debug(
        #     "Final configuration:\n%s",
        #     self.config.model_dump_json(indent=4)
        # )

    @staticmethod
    def parse(expr):
        """Convenience method for testing parser."""
        # note, you can also import parser directly!
        return parser(expr)

    def _query_method(self, df: pd.DataFrame, expr: str) -> pd.DataFrame:
        """The method that is attached to the DataFrame.

        Parses ``expr`` into a query spec and runs it against ``df`` using
        this engine's configuration.
        """
        spec = parser(expr)
        logger.debug("Parsed query spec: %s", spec)
        return execute_query(df, spec, self.config, engine=self)

    def attach_to(
        self,
        df: pd.DataFrame,
        method_name: str | None = None,
        alias: str | None = "q",
        mutable: bool = False,
    ) -> pd.DataFrame:
        """
        Attaches the query method to a DataFrame instance.

        The method is set as a plain attribute on ``df`` itself, so it is
        only available on this particular object.

        Args:
            df (pd.DataFrame): The DataFrame to modify.
            method_name (str | None, optional): Attribute name for the query
                method. If None or '', the module default ("querex") is
                used. Defaults to None.
            alias (str | None, optional): A short alias for the query method.
                Set to 'q' by default. If None or '', no alias is created.
                Defaults to 'q'.
            mutable (bool): If True, the fuzzy matcher is rebuilt on every
                call instead of being cached. Use when the DataFrame's
                contents change between queries. Defaults to False.

        Returns:
            pd.DataFrame: The same DataFrame, now with the query method
            attached.
        """
        # Resolve the name first so the log line reports what is really set.
        method_name = method_name or _METHOD_NAME
        df_id = id(df)
        logger.debug(
            "Attaching .%s method to DataFrame with id: %d", method_name, df_id
        )

        setattr(df, method_name, MethodType(self._query_method, df))
        if alias:
            logger.debug("Adding alias '.%s' for the query method.", alias)
            setattr(df, alias, MethodType(self._query_method, df))

        if mutable:
            self._mutable_ids.add(df_id)
            # Drop any matcher cached while the frame was treated as immutable.
            self._fuzzy_cache.pop(df_id, None)
        else:
            self._mutable_ids.discard(df_id)

        return df
# helper factory method
def _is_public_column(name) -> bool:
    """Return True unless *name* is a string that starts with an underscore."""
    # Non-string labels (ints, tuples, ...) and empty strings cannot be
    # "private", and must not raise when inspected.
    return not (isinstance(name, str) and name.startswith("_"))


def querexfuzz_from_df(
    df,
    *,
    base_cols=None,
    bang_field=None,
    default_date_field=None,
    recent_field=None,
    fuzzy_fields=None,
    score_col_name="score",
    attach=True,
    fuzzy_limit=50,
):
    """
    Create a Querexfuzz by inspecting df.

    By default, all columns not starting _ are selected to base_cols.
    All datetime columns (not starting _) become date_fields. If there is
    only one, it becomes the default_date_field (default for @ clauses) and
    the recent_field (field for sorting for recent). If only one of
    default_date_field / recent_field ends up set, the other takes its value.

    If fuzzy_fields is None (or empty) then all object columns not starting
    _ are selected. If there is only one fuzzy field, it becomes bang_field
    (default for regex) when that is omitted. Highlight mode is true if
    len(fuzzy_fields) == 1.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect (and, if ``attach``, to attach the method to).
    base_cols : list, optional
        Base columns; inferred from ``df.columns`` when omitted or empty.
    bang_field, default_date_field, recent_field : optional
        Explicit choices overriding the inferred defaults described above.
    fuzzy_fields : str or list, optional
        Column(s) used for fuzzy search; a single name may be given as a
        plain string.
    score_col_name : str
        Name for the score column in fuzzy matches.
    attach : bool
        Attach the query method to df in place (default True).
    fuzzy_limit : int
        Value passed as the fuzzy ``limit`` setting (default 50).

    Returns
    -------
    Querexfuzz
        The configured engine.
    """
    base_cols = base_cols or [c for c in df.columns if _is_public_column(c)]

    date_fields = [
        c
        for c in df.select_dtypes(include=["datetime64[ns]", "datetimetz"]).columns
        if _is_public_column(c)
    ]
    if default_date_field is None and len(date_fields) == 1:
        default_date_field = date_fields[0]
    if recent_field is None and len(date_fields) == 1:
        recent_field = date_fields[0]
    elif recent_field is None and default_date_field is not None:
        recent_field = default_date_field
    elif recent_field is not None and default_date_field is None:
        default_date_field = recent_field

    # A bare string is one field name, not a sequence of characters.
    if isinstance(fuzzy_fields, str):
        fuzzy_fields = [fuzzy_fields]
    fuzzy_fields = fuzzy_fields or [
        c for c in df.select_dtypes(include="object").columns if _is_public_column(c)
    ]
    if bang_field is None and len(fuzzy_fields) == 1:
        bang_field = fuzzy_fields[0]
    highlight = len(fuzzy_fields) == 1

    qfz = Querexfuzz(
        base_cols=base_cols,
        date_fields=date_fields,
        default_date_field=default_date_field,
        bang_field=bang_field,
        recent_field=recent_field,
        fuzzy=FuzzyConfig(
            fields=fuzzy_fields,
            limit=fuzzy_limit,
            score_col_name=score_col_name,
            highlight=highlight,
        ),
    )
    if attach:
        # this acts in-place
        qfz.attach_to(df)
    return qfz
def querexfuzz_help() -> str:
    """Return a plain-text summary of the query grammar.

    The text lists each clause (left column) with a short description
    (right column), one clause per line, in the order the clauses must
    appear in a query.
    """
    # Content is kept at column 0 so the returned text has no leading indent.
    return """
querexfuzz Help
================

Query syntax (all clauses optional, must appear in this order)
--------------------------------------------------------------

An empty query returns all base columns for all rows.

verbose                         print query details
recent                          sort by most recent date field
top n | bottom n                limit to n rows from head or tail
select col1[, col2]             named columns
select * | **                   * = base cols (default), ** = all cols
select *, -col | **, -col       base/all cols minus col
! regex | field ~ regex         regex filter on bang_field or named field
where sql_expression            where city == 'Berlin' and age > 30
order|sort by [-]col[, cols]    - prefix for descending order
@[field] unit[-from[:to]]       date range filter e.g. @m-3, @created_date y-2:1
                                units: c=calendar year, y, q, m, w, d, h
# fuzzy_term                    fuzzy search (must be last clause)
"""