1
0
Fork 0
mirror of https://gitlab.com/SIGBUS/nyaa.git synced 2024-12-23 00:39:59 +00:00

Extend ES term preprocessing for OR groups

Implements handling of "foo"|"bar" literal OR groups in the Elasticsearch
term preprocessor. Groups can be negated with -, but don't mesh with
precedence (like plain literals).

This is a partial hack; the real solution would be to parse the entire
search terms ourselves, with AND and OR groups, negations etc. But
having that work neatly with the simple_query_string would be a bit of a
hassle.
This commit is contained in:
TheAMM 2018-04-15 09:15:54 +03:00
parent 0b78428abc
commit 87502978c3

View file

@ -70,7 +70,18 @@ def _generate_query_string(term, category, filter, user):
# For preprocessing ES search terms in _parse_es_search_terms # For preprocessing ES search terms in _parse_es_search_terms
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"') QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.+?)"')
# Matches a whole [-]"foo"|"bar"[|"baz"...] OR group in the search terms.
# Group 1 is the optional leading '-' (negates the entire group); group 2 is
# the pipe-joined run of quoted literals, quotes included.
# Each literal is matched with [^"\n]+ instead of a lazy .+? so a literal can
# never run across a closing quote: with .+?, the input '"foo" "bar"|"baz"'
# was swallowed as a single group starting at "foo", which turned the
# standalone "foo" term into one more OR alternative.
QUOTED_LITERAL_GROUP_REGEX = re.compile(r'''
    (-)?                # Negate entire group at once
    (
        "[^"\n]+"       # First literal
        (?:
            \|          # OR
            "[^"\n]+"   # Second literal
        )+              # repeating
    )
    ''', re.I | re.X)
def _es_name_exact_phrase(literal): def _es_name_exact_phrase(literal):
@ -98,7 +109,30 @@ def _parse_es_search_terms(search, search_terms):
must_set = set() must_set = set()
must_not_set = set() must_not_set = set()
def literal_matcher(match): must_or_groups = []
must_not_or_groups = []
def must_group_matcher(match):
    ''' Collects one [-]"foo"|"bar"[|"baz"...] OR group from the search terms.

    Used as a re.sub() callback: the group's literals are turned into a
    single bool/should query, which is stored in the negated or the plain
    OR-group list. The matched text is replaced with an empty string. '''
    is_negated = bool(match.group(1))
    # findall() yields one (dash, text) tuple per quoted literal in the group;
    # only the text between the quotes is needed here
    phrase_queries = [
        _es_name_exact_phrase(text)
        for _dash, text in QUOTED_LITERAL_REGEX.findall(match.group(2))
    ]
    or_query = Q('bool', should=phrase_queries)

    target_groups = must_not_or_groups if is_negated else must_or_groups
    target_groups.append(or_query)

    # Drop the consumed group from the remaining search terms
    return ''
def must_matcher(match):
''' Grabs [-]"foo" literals from the search terms '''
negated = bool(match.group(1)) negated = bool(match.group(1))
literal = match.group(2) literal = match.group(2)
@ -111,11 +145,12 @@ def _parse_es_search_terms(search, search_terms):
return '' return ''
# Remove quoted parts (optionally prepended with -) and store them in the sets # Remove quoted parts (optionally prepended with -) and store them in the sets
parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip() parsed_search_terms = QUOTED_LITERAL_GROUP_REGEX.sub(must_group_matcher, search_terms).strip()
parsed_search_terms = QUOTED_LITERAL_REGEX.sub(must_matcher, parsed_search_terms).strip()
# Create phrase matches (if any) # Create phrase matches (if any)
must_queries = [_es_name_exact_phrase(lit) for lit in must_set] must_queries = [_es_name_exact_phrase(lit) for lit in must_set] + must_or_groups
must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_not_or_groups
if parsed_search_terms: if parsed_search_terms:
# Normal text search without the quoted parts # Normal text search without the quoted parts