1
0
Fork 0
mirror of https://gitlab.com/SIGBUS/nyaa.git synced 2024-12-22 19:10:00 +00:00

Extend ES term preprocessing for OR groups

Implements handling "foo"|"bar" literal OR groups in the Elasticsearch
term preprocessor. Groups can be negated with -, but don't mesh with
precedence (like plain literals).

This is a partial hack, the real solution would be to parse the entire
search terms ourselves, with AND and OR groups, negations etc. But
having that work neatly with the simple_query_string would be bit of a
hassle.
This commit is contained in:
TheAMM 2018-04-15 09:15:54 +03:00
parent 0b78428abc
commit 87502978c3

View file

@ -70,7 +70,18 @@ def _generate_query_string(term, category, filter, user):
# For preprocessing ES search terms in _parse_es_search_terms
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.+?)"')
QUOTED_LITERAL_GROUP_REGEX = re.compile(r'''
(?i)
(-)? # Negate entire group at once
(
".+?" # First literal
(?:
\| # OR
".+?" # Second literal
)+ # repeating
)
''', re.X)
def _es_name_exact_phrase(literal):
@ -98,7 +109,30 @@ def _parse_es_search_terms(search, search_terms):
must_set = set()
must_not_set = set()
def literal_matcher(match):
must_or_groups = []
must_not_or_groups = []
def must_group_matcher(match):
''' Grabs [-]"foo"|"bar"[|"baz"...] groups from the search terms '''
negated = bool(match.group(1))
literal_group = match.group(2)
literals = QUOTED_LITERAL_REGEX.findall(literal_group)
group_query = Q(
'bool',
should=[_es_name_exact_phrase(lit_m[1]) for lit_m in literals]
)
if negated:
must_not_or_groups.append(group_query)
else:
must_or_groups.append(group_query)
# Remove the parsed group from search terms
return ''
def must_matcher(match):
''' Grabs [-]"foo" literals from the search terms '''
negated = bool(match.group(1))
literal = match.group(2)
@ -111,11 +145,12 @@ def _parse_es_search_terms(search, search_terms):
return ''
# Remove quoted parts (optionally prepended with -) and store them in the sets
parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()
parsed_search_terms = QUOTED_LITERAL_GROUP_REGEX.sub(must_group_matcher, search_terms).strip()
parsed_search_terms = QUOTED_LITERAL_REGEX.sub(must_matcher, parsed_search_terms).strip()
# Create phrase matches (if any)
must_queries = [_es_name_exact_phrase(lit) for lit in must_set]
must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set]
must_queries = [_es_name_exact_phrase(lit) for lit in must_set] + must_or_groups
must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_not_or_groups
if parsed_search_terms:
# Normal text search without the quoted parts