bbm25_haystack.filters
# SPDX-FileCopyrightText: 2024-present Yuxuan Wang <wangy49@seas.upenn.edu>
#
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Iterable, Mapping
from functools import wraps
from typing import Any, Callable, Final, Optional

import pandas as pd
from haystack.dataclasses import Document
from haystack.errors import FilterError


def apply_filters_to_document(
    filters: Optional[dict[str, Any]], document: Document
) -> bool:
    """
    Apply filters to a document.

    :param filters: The filters to apply to the document. ``None`` or
        an empty dictionary means "no filtering".
    :type filters: Optional[dict[str, Any]]
    :param document: The document to apply the filters to.
    :type document: Document

    :return: True if the document passes the filters.
    :rtype: bool

    :raises FilterError: If the filters are malformed or the values
        cannot be compared.
    """
    # Both ``None`` and ``{}`` are falsy, so a single check suffices.
    if not filters:
        return True
    return _run_comparison_condition(filters, document)


def _get_document_field(document: Document, field: str) -> Optional[Any]:
    """
    Get the value of a field in a document.

    If the field is not found within the document then, instead of
    raising an error, `None` is returned. Note that here we do not
    implicitly add 'meta' prefix for fields that are not a direct
    attribute of the document, not supporting legacy behavior anymore.

    :param document: The document to get the field value from.
    :type document: Document
    :param field: The field to get the value of. A dotted name such
        as ``meta.a.b`` is resolved by walking ``document.meta``.
    :type field: str

    :return: The value of the field in the document, or `None` if
        the field cannot be resolved.
    :rtype: Optional[Any]
    """
    if "." not in field:
        # Supply a default so that an unknown attribute yields `None`
        # as documented, rather than raising AttributeError.
        return getattr(document, field, None)

    # NOTE: the first segment of a dotted field is skipped without
    # being checked; the remaining segments are looked up in `meta`.
    attr: Any = document.meta
    for key in field.split(".")[1:]:
        # A non-mapping value cannot be traversed any further; treat
        # the field as missing instead of failing on `.get`.
        if not isinstance(attr, Mapping):
            return None
        attr = attr.get(key)
        if attr is None:
            return None
    return attr


def _run_logical_condition(condition: dict[str, Any], document: Document) -> bool:
    """
    Evaluate a logical condition ('AND', 'OR', 'NOT') on a document.

    :param condition: The logical condition, holding an 'operator'
        key and a 'conditions' key.
    :type condition: dict[str, Any]
    :param document: The document to check the condition against.
    :type document: Document

    :return: The result of the logical operator applied to the
        nested conditions.
    :rtype: bool

    :raises FilterError: If a required key is missing or the
        operator is unknown.
    """
    if "operator" not in condition:
        msg = "Logical condition must have an 'operator' key."
        raise FilterError(msg)
    if "conditions" not in condition:
        msg = "Logical condition must have a 'conditions' key."
        raise FilterError(msg)

    operator = condition["operator"]
    conditions = condition["conditions"]
    try:
        reducer = LOGICAL_OPERATORS[operator]
    except (KeyError, TypeError) as exc:
        # Report unknown (or unhashable) operators as a filter error
        # instead of leaking a bare KeyError/TypeError to the caller.
        msg = (
            f"Unknown logical operator '{operator}'. "
            f"Valid operators are: {list(LOGICAL_OPERATORS)}."
        )
        raise FilterError(msg) from exc

    return reducer(document, conditions)


def _run_comparison_condition(condition: dict[str, Any], document: Document) -> bool:
    """
    Evaluate a comparison condition on a document.

    A condition without a 'field' key is treated as a logical
    condition and delegated to `_run_logical_condition`.

    :param condition: The condition, holding 'field', 'operator'
        and 'value' keys.
    :type condition: dict[str, Any]
    :param document: The document to check the condition against.
    :type document: Document

    :return: The result of comparing the document field with the
        filter value.
    :rtype: bool

    :raises FilterError: If a required key is missing, the operator
        is unknown, or the values cannot be compared.
    """
    if "field" not in condition:
        return _run_logical_condition(condition, document)

    if "operator" not in condition:
        msg = "Comparison condition must have an 'operator' key."
        raise FilterError(msg)
    if "value" not in condition:
        msg = "Comparison condition must have a 'value' key."
        raise FilterError(msg)

    field: str = condition["field"]
    value: Any = condition["value"]
    operator = condition["operator"]
    try:
        comparator = COMPARISON_OPERATORS[operator]
    except (KeyError, TypeError) as exc:
        # Same reasoning as for logical operators: surface a
        # FilterError rather than a bare KeyError/TypeError.
        msg = (
            f"Unknown comparison operator '{operator}'. "
            f"Valid operators are: {list(COMPARISON_OPERATORS)}."
        )
        raise FilterError(msg) from exc

    return comparator(_get_document_field(document, field), value)


def _and(document: Document, conditions: list[dict[str, Any]]) -> bool:
    """
    Return True if all conditions are met.

    :param document: The document to check the conditions against.
    :type document: Document
    :param conditions: The conditions to check against the document.
    :type conditions: list[dict[str, Any]]

    :return: True if all conditions are met.
    :rtype: bool
    """
    return all(
        _run_comparison_condition(condition, document) for condition in conditions
    )


def _or(document: Document, conditions: list[dict[str, Any]]) -> bool:
    """
    Return True if any condition is met.

    :param document: The document to check the conditions against.
    :type document: Document
    :param conditions: The conditions to check against the document.
    :type conditions: list[dict[str, Any]]

    :return: True if at least one condition is met.
    :rtype: bool
    """
    return any(_run_comparison_condition(cond, document) for cond in conditions)


def _not(document: Document, conditions: list[dict[str, Any]]) -> bool:
    """
    Return True if not all conditions are met.

    The 'NOT' operator is under-specified when supplied with a
    set of conditions instead of a single condition. Because we
    can have the semantics of 'at least one False' versus
    'all False'. Here we choose to comply with the official
    implementation of Haystack (the 'at least one False' semantics).

    :param document: The document to check the conditions against.
    :type document: Document
    :param conditions: The conditions to check against the document.
    :type conditions: list[dict[str, Any]]

    :return: True if not all conditions are met.
    :rtype: bool
    """
    return not _and(document, conditions)


def _check_comparator_inputs(
    comparator: Callable[[Any, Any], bool]
) -> Callable[[Any, Any], bool]:
    """
    A decorator to check and preprocess input attribute values.

    ALL COMPARISON OPERATORS SHOULD BE WRAPPED WITH THIS DECORATOR.
    because a `False` may be returned by both input validation and
    the actual comparison. This decorator ensures that the comparison
    function is only called if the input values are valid.

    :param comparator: The comparator function to wrap.
    :type comparator: Callable[[Any, Any], bool]

    :return: The wrapped comparator function.
    :rtype: Callable[[Any, Any], bool]
    """

    @wraps(comparator)
    def _wrapper(dv: Any, fv: Any) -> bool:

        # I think allowing comparison between DataFrames would
        # be a really bad idea because it would create unexpected
        # behavior, but I am open to discussion on this.
        if isinstance(dv, pd.DataFrame) or isinstance(fv, pd.DataFrame):
            msg = (
                "Cannot compare DataFrames. Please convert them to "
                "simpler data structures before comparing."
            )
            raise FilterError(msg)

        # I think comparison between missing values is ambiguous,
        # but again, I am open to discussion on this. Here I choose
        # to return False if either value is None because from a
        # logical perspective, we really cannot say anything about
        # the comparison between a missing value and a non-missing.
        if dv is None or fv is None:
            return False

        try:
            return comparator(dv, fv)
        except TypeError as exc:
            msg = (
                f"Cannot compare document value of {type(dv)} type "
                f"with filter value of {type(fv)} type."
            )
            raise FilterError(msg) from exc

    return _wrapper


@_check_comparator_inputs
def _eq(dv: Any, fv: Any) -> bool:
    """
    Conservative implementation of equal comparison.

    There are two major differences between this implementation
    and the default Haystack filter implementation:
    - If both values are None, we return False, instead of True.
    - If any value is a DataFrame, we raise an error, instead
        of converting them to JSON.
    """
    return dv == fv


@_check_comparator_inputs
def _ne(dv: Any, fv: Any) -> bool:
    """
    Return True if the document value differs from the filter value.

    Because of the input check, a missing (None) value on either
    side yields False here as well, not True.
    """
    return not _eq(dv, fv)


@_check_comparator_inputs
def _gt(dv: Any, fv: Any) -> bool:
    """
    A more liberal implementation with fewer surprises.

    Simply compare the two values with default Python comparison.
    We do not perform any conversion here to have the behavior
    more predictable. If we want to compare the dates, we should
    just convert the document value and filter value explicitly
    to dates before comparing them.
    """
    return dv > fv


@_check_comparator_inputs
def _lt(dv: Any, fv: Any) -> bool:
    """Return True if the document value is less than the filter value."""
    return dv < fv


@_check_comparator_inputs
def _gte(dv: Any, fv: Any) -> bool:
    """Return True if the document value is greater than or equal to the filter value."""
    return _gt(dv, fv) or _eq(dv, fv)


@_check_comparator_inputs
def _lte(dv: Any, fv: Any) -> bool:
    """Return True if the document value is less than or equal to the filter value."""
    return _lt(dv, fv) or _eq(dv, fv)


@_check_comparator_inputs
def _in(dv: Any, fv: Any) -> bool:
    """
    Allowing iterable filter values not just lists.

    This implementation permits a larger set of filter values
    such as tuples, sets, and other iterable objects.

    :raises FilterError: If the filter value is not iterable.
    """
    # NOTE: a `str` is iterable too, so a string filter value is
    # matched character by character against the document value.
    if not isinstance(fv, Iterable):
        msg = "Filter value must be an iterable for 'in' comparison."
        raise FilterError(msg)

    return any(_eq(dv, v) for v in fv)


@_check_comparator_inputs
def _nin(dv: Any, fv: Any) -> bool:
    """
    Return True if the document value equals none of the filter values.

    Because of the input check, a missing (None) document value
    yields False here rather than True.
    """
    return not _in(dv, fv)


# Maps each logical operator name to the function that evaluates it.
LOGICAL_OPERATORS: Final = {"NOT": _not, "AND": _and, "OR": _or}

# Maps each comparison operator name to the function that evaluates it.
COMPARISON_OPERATORS: Final = {
    "==": _eq,
    "!=": _ne,
    ">": _gt,
    "<": _lt,
    ">=": _gte,
    "<=": _lte,
    "in": _in,
    "not in": _nin,
}
def
apply_filters_to_document( filters: Optional[dict[str, Any]], document: haystack.dataclasses.document.Document) -> bool:
def apply_filters_to_document(
    filters: Optional[dict[str, Any]], document: Document
) -> bool:
    """
    Decide whether a document satisfies the given filters.

    :param filters: The filters to apply to the document. ``None``
        or an empty dictionary lets every document pass.
    :type filters: Optional[dict[str, Any]]
    :param document: The document to apply the filters to.
    :type document: Document

    :return: True if the document passes the filters.
    :rtype: bool
    """
    # A truthy value means there is at least one filter to evaluate.
    if filters:
        return _run_comparison_condition(filters, document)
    return True
Apply filters to a document.
Parameters
- filters: The filters to apply to the document.
- document: The document to apply the filters to.
Returns
True if the document passes the filters.
LOGICAL_OPERATORS: Final =
{'NOT': <function _not>, 'AND': <function _and>, 'OR': <function _or>}
COMPARISON_OPERATORS: Final =
{'==': <function _eq>, '!=': <function _ne>, '>': <function _gt>, '<': <function _lt>, '>=': <function _gte>, '<=': <function _lte>, 'in': <function _in>, 'not in': <function _nin>}