From 60d1570fb5b7dc54972bc66b7cc7d0244106efdf Mon Sep 17 00:00:00 2001
From: TheAMM
Date: Fri, 13 Apr 2018 14:46:35 +0300
Subject: [PATCH] Preprocess ES terms for better literal matching

This commit adds a new .exact subfield to display_name, which holds a
barely-filtered version of the original title we can do "literal" matching
against. This is not real substring matching, but quoting terms now
actually does something!

Implements a simple preprocessor for the search terms to extract quoted
parts from the search terms, optionally prefixed with - to negate them.
The preprocessor will create a query that'll join all three query-types:
the simple_query_string, must-phrases and must-not-phrases.
---
 es_mapping.yml |  9 ++++++
 nyaa/search.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/es_mapping.yml b/es_mapping.yml
index 6001305..e064ce1 100644
--- a/es_mapping.yml
+++ b/es_mapping.yml
@@ -23,6 +23,11 @@ settings:
           - my_ngram
           - word_delimit
           - trim_zero
+      # For exact matching - whitespace tokenizer + lowercase filter only
+      exact_analyzer:
+        tokenizer: whitespace
+        filter:
+          - lowercase
       # For matching full words longer than the ngram limit (15 chars)
       my_fullword_index_analyzer:
         type: custom
@@ -91,6 +96,10 @@ mappings:
           fullword:
             type: text
             analyzer: my_fullword_index_analyzer
+          # Subfield for exact phrase matching (queried as display_name.exact)
+          exact:
+            type: text
+            analyzer: exact_analyzer
       created_time:
         type: date
       # Only in the ES index for generating magnet links
diff --git a/nyaa/search.py b/nyaa/search.py
index 364aeb6..0adb43b 100644
--- a/nyaa/search.py
+++ b/nyaa/search.py
@@ -69,6 +69,79 @@ def _generate_query_string(term, category, filter, user):
     return params


+# Matches "quoted literals" (optionally negated with a leading -) for _parse_es_search_terms
+QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')
+
+
+def _es_name_exact_phrase(literal):
+    ''' Returns a match_phrase Query for the given literal on the display_name.exact subfield '''
+    return Q({
+        'match_phrase': {
+            'display_name.exact': {
+                'query': literal,
+                'analyzer': 'exact_analyzer'
+            }
+        }
+    })
+
+
+def _parse_es_search_terms(search, search_terms):
+    ''' Parse search terms into a bool query with properly handled literal phrases
+        (the simple_query_string alone is not so great with exact results). For example:
+            foo bar "hello world" -"exclude this"
+        will become a must simple_query_string for "foo bar", a must match_phrase for
+        "hello world" and a must_not match_phrase for "exclude this". Blank literals
+        ("" or " ") are dropped, since an empty phrase would match no documents at all.
+        Returns the search with the generated bool-query added to it (if there is one). '''
+
+    # Literal must and must-not sets (sets, so repeated literals produce a single query)
+    must_set = set()
+    must_not_set = set()
+
+    def literal_matcher(match):
+        negated = bool(match.group(1))
+        literal = match.group(2)
+
+        if negated:
+            must_not_set.add(literal)
+        else:
+            must_set.add(literal)
+
+        # Remove the parsed literal from search terms
+        return ''
+
+    # Remove quoted parts (optionally prepended with -) and store them in the sets
+    parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()
+
+    # Create phrase matches, skipping blank literals (an empty phrase matches nothing)
+    must_queries = [_es_name_exact_phrase(lit) for lit in must_set if lit.strip()]
+    must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set if lit.strip()]
+
+    if parsed_search_terms:
+        # Normal text search without the quoted parts
+        must_queries.append(
+            Q(
+                'simple_query_string',
+                # Query both fields, latter for words with >15 chars
+                fields=['display_name', 'display_name.fullword'],
+                analyzer='my_search_analyzer',
+                default_operator="AND",
+                query=parsed_search_terms
+            )
+        )
+
+    if must_queries or must_not_queries:
+        # Create a combined Query with the positive and negative matches
+        combined_search_query = Q(
+            'bool',
+            must=must_queries,
+            must_not=must_not_queries
+        )
+        search = search.query(combined_search_query)
+
+    return search
+
+
 def search_elastic(term='', user=None, sort='id', order='desc',
                    category='0_0', quality_filter='0', page=1,
                    rss=False, admin=False, logged_in_user=None,
@@ -165,12 +238,8 @@ def search_elastic(term='', user=None, sort='id', order='desc',

     # Apply search term
     if term:
-        s = s.query('simple_query_string',
-                    # Query both fields, latter for words with >15 chars
-                    fields=['display_name', 'display_name.fullword'],
-                    analyzer='my_search_analyzer',
-                    default_operator="AND",
-                    query=term)
+        # Do some preprocessing on the search terms for literal "" matching
+        s = _parse_es_search_terms(s, term)

     # User view (/user/username)
     if user: