bbm25_haystack.filters

  1# SPDX-FileCopyrightText: 2024-present Yuxuan Wang <wangy49@seas.upenn.edu>
  2#
  3# SPDX-License-Identifier: Apache-2.0
  4from collections.abc import Iterable
  5from functools import wraps
  6from typing import Any, Callable, Final, Optional
  7
  8import pandas as pd
  9from haystack.dataclasses import Document
 10from haystack.errors import FilterError
 11
 12
 13def apply_filters_to_document(
 14    filters: Optional[dict[str, Any]], document: Document
 15) -> bool:
 16    """
 17    Apply filters to a document.
 18
 19    :param filters: The filters to apply to the document.
 20    :type filters: dict[str, Any]
 21    :param document: The document to apply the filters to.
 22    :type document: Document
 23
 24    :return: True if the document passes the filters.
 25    :rtype: bool
 26    """
 27    if filters is None or not filters:
 28        return True
 29    return _run_comparison_condition(filters, document)
 30
 31
 32def _get_document_field(document: Document, field: str) -> Optional[Any]:
 33    """
 34    Get the value of a field in a document.
 35
 36    If the field is not found within the document then, instead of
 37    raising an error, `None` is returned. Note that here we do not
 38    implicitly add 'meta' prefix for fields that are not a direct
 39    attribute of the document, not supporting legacy behavior anymore.
 40
 41    :param document: The document to get the field value from.
 42    :type document: Document
 43    :param field: The field to get the value of.
 44    :type field: str
 45
 46    :return: The value of the field in the document.
 47    :rtype: Optional[Any]
 48    """
 49    if "." not in field:
 50        return getattr(document, field)
 51
 52    attr = document.meta
 53    for f in field.split(".")[1:]:
 54        attr = attr.get(f)
 55        if attr is None:
 56            return None
 57    return attr
 58
 59
 60def _run_logical_condition(condition: dict[str, Any], document: Document) -> bool:
 61    if "operator" not in condition:
 62        msg = "Logical condition must have an 'operator' key."
 63        raise FilterError(msg)
 64    if "conditions" not in condition:
 65        msg = "Logical condition must have a 'conditions' key."
 66        raise FilterError(msg)
 67
 68    conditions = condition["conditions"]
 69    reducer = LOGICAL_OPERATORS[condition["operator"]]
 70
 71    return reducer(document, conditions)
 72
 73
 74def _run_comparison_condition(condition: dict[str, Any], document: Document) -> bool:
 75    if "field" not in condition:
 76        return _run_logical_condition(condition, document)
 77
 78    if "operator" not in condition:
 79        msg = "Comparison condition must have an 'operator' key."
 80        raise FilterError(msg)
 81    if "value" not in condition:
 82        msg = "Comparison condition must have a 'value' key."
 83        raise FilterError(msg)
 84
 85    field: str = condition["field"]
 86    value: Any = condition["value"]
 87    comparator = COMPARISON_OPERATORS[condition["operator"]]
 88
 89    return comparator(_get_document_field(document, field), value)
 90
 91
 92def _and(document: Document, conditions: list[dict[str, Any]]) -> bool:
 93    """
 94    Return True if all conditions are met.
 95
 96    :param document: The document to check the conditions against.
 97    :type document: Document
 98    :param conditions: The conditions to check against the document.
 99    :type conditions: list[dict[str, Any]]
100
101    :return: True if not all conditions are met.
102    :rtype: bool
103    """
104    return all(
105        _run_comparison_condition(condition, document) for condition in conditions
106    )
107
108
def _or(document: Document, conditions: list[dict[str, Any]]) -> bool:
    """
    Return True if any condition is met.

    Conditions are evaluated in order and evaluation stops at the
    first one that succeeds. An empty list of conditions yields False.

    :param document: The document to check the conditions against.
    :type document: Document
    :param conditions: The conditions to check against the document.
    :type conditions: list[dict[str, Any]]

    :return: True if at least one condition is met.
    :rtype: bool
    """
    for condition in conditions:
        if _run_comparison_condition(condition, document):
            return True
    return False
122
123
def _not(document: Document, conditions: list[dict[str, Any]]) -> bool:
    """
    Return True if at least one condition is not met.

    The 'NOT' operator is under-specified when it is given a set of
    conditions rather than a single one: it could mean 'at least one
    False' or 'all False'. We follow the official Haystack
    implementation and use the 'at least one False' semantics, i.e.
    the negation of 'AND'. An empty list of conditions yields False.

    :param document: The document to check the conditions against.
    :type document: Document
    :param conditions: The conditions to check against the document.
    :type conditions: list[dict[str, Any]]

    :return: True if not all conditions are met.
    :rtype: bool
    """
    # Negated 'AND': succeed as soon as one condition fails.
    for cond in conditions:
        if not _run_comparison_condition(cond, document):
            return True
    return False
143
144
def _check_comparator_inputs(
    comparator: Callable[[Any, Any], bool]
) -> Callable[[Any, Any], bool]:
    """
    Decorate a comparator with input validation and preprocessing.

    ALL COMPARISON OPERATORS SHOULD BE WRAPPED WITH THIS DECORATOR,
    because a `False` can come either from input validation or from
    the comparison itself. The wrapper only calls the comparator
    once both inputs have been accepted:

        - a DataFrame on either side raises `FilterError`;
        - a `None` on either side returns False without comparing;
        - a `TypeError` raised by the comparator is re-raised as
          `FilterError`.

    :param comparator: The comparator function to wrap.
    :type comparator: Callable[[Any, Any], bool]

    :return: The wrapped comparator function.
    :rtype: Callable[[Any, Any], bool]
    """

    @wraps(comparator)
    def _guarded(dv: Any, fv: Any) -> bool:
        # Comparing DataFrames is deliberately unsupported because it
        # would lead to unexpected behavior; callers must convert them
        # to simpler structures first.
        if any(isinstance(operand, pd.DataFrame) for operand in (dv, fv)):
            raise FilterError(
                "Cannot compare DataFrames. Please convert them to "
                "simpler data structures before comparing."
            )

        # Nothing meaningful can be said about a comparison that
        # involves a missing value, so it never matches.
        if dv is None or fv is None:
            return False

        try:
            outcome = comparator(dv, fv)
        except TypeError as exc:
            raise FilterError(
                f"Cannot compare document value of {type(dv)} type "
                f"with filter value of {type(fv)} type."
            ) from exc
        return outcome

    return _guarded
194
195
@_check_comparator_inputs
def _eq(dv: Any, fv: Any) -> bool:
    """
    Conservative equality comparison.

    Differs from the default Haystack filter implementation in two
    ways, both enforced by the `_check_comparator_inputs` decorator:
        - If either value (or both) is None, the result is False
            rather than True.
        - If either value is a DataFrame, an error is raised rather
            than converting the values to JSON.

    :param dv: The document value.
    :param fv: The filter value.

    :return: The result of `dv == fv`.
    """
    is_equal = dv == fv
    return is_equal
208
209
@_check_comparator_inputs
def _ne(dv: Any, fv: Any) -> bool:
    """
    Inequality comparison, defined as the negation of `_eq`.

    :param dv: The document value.
    :param fv: The filter value.

    :return: True if `_eq` does not consider the values equal.
    """
    is_equal = _eq(dv, fv)
    return not is_equal
213
214
@_check_comparator_inputs
def _gt(dv: Any, fv: Any) -> bool:
    """
    Greater-than comparison using plain Python semantics.

    This is a more liberal implementation with fewer surprises: the
    two values are compared with Python's default `>` and no
    conversion is attempted, which keeps the behavior predictable.
    To compare dates, convert both the document value and the filter
    value to dates explicitly before filtering.

    :param dv: The document value.
    :param fv: The filter value.

    :return: The result of `dv > fv`.
    """
    is_greater = dv > fv
    return is_greater
227
228
@_check_comparator_inputs
def _lt(dv: Any, fv: Any) -> bool:
    """
    Less-than comparison using plain Python semantics.

    The two values are compared with Python's default `<`; no
    conversion is performed.

    :param dv: The document value.
    :param fv: The filter value.

    :return: The result of `dv < fv`.
    """
    is_less = dv < fv
    return is_less
232
233
@_check_comparator_inputs
def _gte(dv: Any, fv: Any) -> bool:
    """
    Greater-than-or-equal comparison built from `_gt` and `_eq`.

    :param dv: The document value.
    :param fv: The filter value.

    :return: The result of `_gt` if it is truthy, otherwise the
        result of `_eq`.
    """
    greater = _gt(dv, fv)
    if greater:
        return greater
    return _eq(dv, fv)
237
238
@_check_comparator_inputs
def _lte(dv: Any, fv: Any) -> bool:
    """
    Less-than-or-equal comparison built from `_lt` and `_eq`.

    :param dv: The document value.
    :param fv: The filter value.

    :return: The result of `_lt` if it is truthy, otherwise the
        result of `_eq`.
    """
    less = _lt(dv, fv)
    if less:
        return less
    return _eq(dv, fv)
242
243
@_check_comparator_inputs
def _in(dv: Any, fv: Any) -> bool:
    """
    Membership test that accepts any iterable filter value.

    Unlike an implementation restricted to lists, this one permits a
    larger set of filter values such as tuples, sets, and other
    iterable objects. Each element is compared with the document
    value through `_eq`.

    :param dv: The document value.
    :param fv: The iterable of candidate values.

    :raises FilterError: If the filter value is not an iterable.

    :return: True if the document value equals any element of `fv`.
    """
    if isinstance(fv, Iterable):
        for candidate in fv:
            if _eq(dv, candidate):
                return True
        return False

    msg = "Filter value must be an iterable for 'in' comparison."
    raise FilterError(msg)
257
258
@_check_comparator_inputs
def _nin(dv: Any, fv: Any) -> bool:
    """
    Non-membership test, defined as the negation of `_in`.

    :param dv: The document value.
    :param fv: The iterable of candidate values.

    :return: True if `_in` does not find the document value in `fv`.
    """
    is_member = _in(dv, fv)
    return not is_member
262
263
# Logical operator name -> reducer, called as `reducer(document, conditions)`.
LOGICAL_OPERATORS: Final = {
    "NOT": _not,
    "AND": _and,
    "OR": _or,
}
265
# Comparison operator symbol -> comparator, called as
# `comparator(document_value, filter_value)`.
COMPARISON_OPERATORS: Final = {
    # Equality
    "==": _eq,
    "!=": _ne,
    # Ordering
    ">": _gt,
    "<": _lt,
    ">=": _gte,
    "<=": _lte,
    # Membership
    "in": _in,
    "not in": _nin,
}
def apply_filters_to_document( filters: Optional[dict[str, Any]], document: haystack.dataclasses.document.Document) -> bool:
def apply_filters_to_document(
    filters: Optional[dict[str, Any]], document: Document
) -> bool:
    """
    Decide whether a document satisfies the given filters.

    A missing (`None`) or empty filter dictionary imposes no
    restriction, so every document passes in that case. Otherwise
    the filter dictionary is evaluated against the document as a
    (possibly nested) condition.

    :param filters: The filters to evaluate, or `None` for no filtering.
    :type filters: Optional[dict[str, Any]]
    :param document: The document to evaluate the filters against.
    :type document: Document

    :return: True if the document passes the filters.
    :rtype: bool
    """
    # `None` and `{}` are both falsy, so one truthiness test covers both.
    if filters:
        return _run_comparison_condition(filters, document)
    return True

Apply filters to a document.

Parameters
  • filters: The filters to apply to the document.
  • document: The document to apply the filters to.
Returns

True if the document passes the filters.

LOGICAL_OPERATORS: Final = {'NOT': <function _not>, 'AND': <function _and>, 'OR': <function _or>}
COMPARISON_OPERATORS: Final = {'==': <function _eq>, '!=': <function _ne>, '>': <function _gt>, '<': <function _lt>, '>=': <function _gte>, '<=': <function _lte>, 'in': <function _in>, 'not in': <function _nin>}