From 0cc25c3569d22b2734730682aa14ad872aa3a6e7 Mon Sep 17 00:00:00 2001 From: Anna-Maria Meriniemi Date: Sun, 15 Apr 2018 09:53:36 +0300 Subject: [PATCH] [ES] Improve search term preprocessing to include literal groups (#477) * Extend ES term preprocessing for OR groups Implements handling "foo"|"bar" literal OR groups in the Elasticsearch term preprocessor. Groups can be negated with -, but don't mesh with precedence (like plain literals). This is a partial hack, the real solution would be to parse the entire search terms ourselves, with AND and OR groups, negations etc. But having that work neatly with the simple_query_string would be bit of a hassle. * Update help.html search tips since search (quoting strings) has changed a bit. --- nyaa/search.py | 45 +++++++++++++++++++++++++++++++++++----- nyaa/templates/help.html | 14 ++++++++----- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/nyaa/search.py b/nyaa/search.py index 0adb43b..e8972a1 100644 --- a/nyaa/search.py +++ b/nyaa/search.py @@ -70,7 +70,18 @@ def _generate_query_string(term, category, filter, user): # For preprocessing ES search terms in _parse_es_search_terms -QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"') +QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.+?)"') +QUOTED_LITERAL_GROUP_REGEX = re.compile(r''' + (?i) + (-)? # Negate entire group at once + ( + ".+?" # First literal + (?: + \| # OR + ".+?" # Second literal + )+ # repeating + ) + ''', re.X) def _es_name_exact_phrase(literal): @@ -98,7 +109,30 @@ def _parse_es_search_terms(search, search_terms): must_set = set() must_not_set = set() - def literal_matcher(match): + must_or_groups = [] + must_not_or_groups = [] + + def must_group_matcher(match): + ''' Grabs [-]"foo"|"bar"[|"baz"...] 
groups from the search terms ''' + negated = bool(match.group(1)) + literal_group = match.group(2) + + literals = QUOTED_LITERAL_REGEX.findall(literal_group) + group_query = Q( + 'bool', + should=[_es_name_exact_phrase(lit_m[1]) for lit_m in literals] + ) + + if negated: + must_not_or_groups.append(group_query) + else: + must_or_groups.append(group_query) + + # Remove the parsed group from search terms + return '' + + def must_matcher(match): + ''' Grabs [-]"foo" literals from the search terms ''' negated = bool(match.group(1)) literal = match.group(2) @@ -111,11 +145,12 @@ def _parse_es_search_terms(search, search_terms): return '' # Remove quoted parts (optionally prepended with -) and store them in the sets - parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip() + parsed_search_terms = QUOTED_LITERAL_GROUP_REGEX.sub(must_group_matcher, search_terms).strip() + parsed_search_terms = QUOTED_LITERAL_REGEX.sub(must_matcher, parsed_search_terms).strip() # Create phrase matches (if any) - must_queries = [_es_name_exact_phrase(lit) for lit in must_set] - must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_queries = [_es_name_exact_phrase(lit) for lit in must_set] + must_or_groups + must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_not_or_groups if parsed_search_terms: # Normal text search without the quoted parts diff --git a/nyaa/templates/help.html b/nyaa/templates/help.html index 7854feb..f7de873 100644 --- a/nyaa/templates/help.html +++ b/nyaa/templates/help.html @@ -38,21 +38,25 @@
You can combine search terms with the | operator, such as - horrible|cartel. + foo|bar, to match any of the words instead of all of them.
To exclude results matching a certain word, prefix them with -, - e.g. FFF -memesubs, which will return torrents with FFF in the - name, but not those which have memesubs in the name as well. + e.g. foo -bar, which will return torrents with foo in the + name, but not those which have bar in the name as well.
If you want to search for a several-word expression in its entirety, you can surround searches with " (double quotes), such as "foo bar", which would match torrents named foo bar but not - those named bar foo. + those named bar foo. You may also use the aforementioned | to group + phrases together: "foo bar"|"foo baz". You can negate the entire + group with - (e.g. -"foo bar"|"foo baz"), but not single items.
- You can also use ( and ) to signify precedence. + You can also use ( and ) to signify precedence, but quoted strings do + not honor this. Using (hello world) "foo bar" is fine, but quoted strings inside + the parentheses will lead to unexpected results.
{{ linkable_header("Reporting Torrents", "reporting") }}