From 87502978c32418c75df140bbc5242a0a3859021d Mon Sep 17 00:00:00 2001 From: TheAMM Date: Sun, 15 Apr 2018 09:15:54 +0300 Subject: [PATCH] Extend ES term preprocessing for OR groups Implements handling "foo"|"bar" literal OR groups in the Elasticsearch term preprocessor. Groups can be negated with -, but don't mesh with precedence (like plain literals). This is a partial hack, the real solution would be to parse the entire search terms ourselves, with AND and OR groups, negations etc. But having that work neatly with the simple_query_string would be bit of a hassle. --- nyaa/search.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/nyaa/search.py b/nyaa/search.py index 0adb43b..e8972a1 100644 --- a/nyaa/search.py +++ b/nyaa/search.py @@ -70,7 +70,18 @@ def _generate_query_string(term, category, filter, user): # For preprocessing ES search terms in _parse_es_search_terms -QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"') +QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.+?)"') +QUOTED_LITERAL_GROUP_REGEX = re.compile(r''' + (?i) + (-)? # Negate entire group at once + ( + ".+?" # First literal + (?: + \| # OR + ".+?" # Second literal + )+ # repeating + ) + ''', re.X) def _es_name_exact_phrase(literal): @@ -98,7 +109,30 @@ def _parse_es_search_terms(search, search_terms): must_set = set() must_not_set = set() - def literal_matcher(match): + must_or_groups = [] + must_not_or_groups = [] + + def must_group_matcher(match): + ''' Grabs [-]"foo"|"bar"[|"baz"...] groups from the search terms ''' + negated = bool(match.group(1)) + literal_group = match.group(2) + + literals = QUOTED_LITERAL_REGEX.findall(literal_group) + group_query = Q( + 'bool', + should=[_es_name_exact_phrase(lit_m[1]) for lit_m in literals] + ) + + if negated: + must_not_or_groups.append(group_query) + else: + must_or_groups.append(group_query) + + # Remove the parsed group from search terms + return '' + + def must_matcher(match): + ''' Grabs [-]"foo" literals from the search terms ''' negated = bool(match.group(1)) literal = match.group(2) @@ -111,11 +145,12 @@ def _parse_es_search_terms(search, search_terms): return '' # Remove quoted parts (optionally prepended with -) and store them in the sets - parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip() + parsed_search_terms = QUOTED_LITERAL_GROUP_REGEX.sub(must_group_matcher, search_terms).strip() + parsed_search_terms = QUOTED_LITERAL_REGEX.sub(must_matcher, parsed_search_terms).strip() # Create phrase matches (if any) - must_queries = [_es_name_exact_phrase(lit) for lit in must_set] - must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_queries = [_es_name_exact_phrase(lit) for lit in must_set] + must_or_groups + must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_not_or_groups if parsed_search_terms: # Normal text search without the quoted parts