From 0b78428abc66f248cfcc1161f2e4b714912966a2 Mon Sep 17 00:00:00 2001
From: Anna-Maria Meriniemi
Date: Sat, 14 Apr 2018 03:06:25 +0300
Subject: [PATCH] [ES Change] Improve Elasticsearch term quoting (#473)

* Optimize Elasticsearch fullword field

Since the main display_name field ngrams words up to 15 characters,
anything at or under that length is already indexed - the fullword field
(which we have for words longer than 15 characters) only needs to index
words longer than that.

* Preprocess ES terms for better literal matching

This commit adds a new .exact subfield to display_name, which holds a
barely-filtered version of the original title that we can do "literal"
matching against. This is not real substring matching, but quoting terms
now actually does something!

Implements a simple preprocessor that extracts quoted parts from the
search terms, optionally prefixed with - to negate them. The preprocessor
creates a query joining all three query types: the simple_query_string,
the must-phrases and the must-not-phrases.
---
 es_mapping.yml | 17 ++++++++++-
 nyaa/search.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/es_mapping.yml b/es_mapping.yml
index 4d4e39b..e064ce1 100644
--- a/es_mapping.yml
+++ b/es_mapping.yml
@@ -23,6 +23,11 @@ settings:
         - my_ngram
         - word_delimit
         - trim_zero
+      # For exact matching - simple lowercase + whitespace delimiter
+      exact_analyzer:
+        tokenizer: whitespace
+        filter:
+          - lowercase
       # For matching full words longer than the ngram limit (15 chars)
       my_fullword_index_analyzer:
         type: custom
@@ -32,13 +37,19 @@ settings:
         filter:
           - lowercase
           - word_delimit
-          # These should be enough, as my_index_analyzer will match the rest
+          # Skip tokens shorter than N characters,
+          # since they're already indexed in the main field
+          - fullword_min
     filter:
       my_ngram:
         type: edgeNGram
         min_gram: 1
         max_gram: 15
+      fullword_min:
+        type: length
+        # Remember to change this if you change the max_gram above!
+        min: 16
       resolution:
         type: pattern_capture
         patterns: ["(\\d+)[xX](\\d+)"]
@@ -85,6 +96,10 @@ mappings:
           fullword:
             type: text
             analyzer: my_fullword_index_analyzer
+          # Indexed for exact phrase matching
+          exact:
+            type: text
+            analyzer: exact_analyzer
       created_time:
         type: date
       # Only in the ES index for generating magnet links
diff --git a/nyaa/search.py b/nyaa/search.py
index 364aeb6..0adb43b 100644
--- a/nyaa/search.py
+++ b/nyaa/search.py
@@ -69,6 +69,79 @@ def _generate_query_string(term, category, filter, user):
     return params
 
 
+# For preprocessing ES search terms in _parse_es_search_terms
+QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')
+
+
+def _es_name_exact_phrase(literal):
+    ''' Returns a Query for a phrase match on the display_name for a given literal '''
+    return Q({
+        'match_phrase': {
+            'display_name.exact': {
+                'query': literal,
+                'analyzer': 'exact_analyzer'
+            }
+        }
+    })
+
+
+def _parse_es_search_terms(search, search_terms):
+    ''' Parse search terms into a query with properly handled literal phrases
+        (the simple_query_string is not so great with exact results).
+        For example:
+            foo bar "hello world" -"exclude this"
+        will become a must simple_query_string for "foo bar", a must match_phrase for
+        "hello world" and a must_not for "exclude this".
+        Returns the search with the generated bool-query added to it. '''
+
+    # Literal must and must-not sets
+    must_set = set()
+    must_not_set = set()
+
+    def literal_matcher(match):
+        negated = bool(match.group(1))
+        literal = match.group(2)
+
+        if negated:
+            must_not_set.add(literal)
+        else:
+            must_set.add(literal)
+
+        # Remove the parsed literal from search terms
+        return ''
+
+    # Remove quoted parts (optionally prepended with -) and store them in the sets
+    parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()
+
+    # Create phrase matches (if any)
+    must_queries = [_es_name_exact_phrase(lit) for lit in must_set]
+    must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set]
+
+    if parsed_search_terms:
+        # Normal text search without the quoted parts
+        must_queries.append(
+            Q(
+                'simple_query_string',
+                # Query both fields, latter for words with >15 chars
+                fields=['display_name', 'display_name.fullword'],
+                analyzer='my_search_analyzer',
+                default_operator="AND",
+                query=parsed_search_terms
+            )
+        )
+
+    if must_queries or must_not_queries:
+        # Create a combined Query with the positive and negative matches
+        combined_search_query = Q(
+            'bool',
+            must=must_queries,
+            must_not=must_not_queries
+        )
+        search = search.query(combined_search_query)
+
+    return search
+
+
 def search_elastic(term='', user=None, sort='id', order='desc',
                    category='0_0', quality_filter='0', page=1, rss=False, admin=False,
                    logged_in_user=None,
@@ -165,12 +238,8 @@
 
     # Apply search term
     if term:
-        s = s.query('simple_query_string',
-                    # Query both fields, latter for words with >15 chars
-                    fields=['display_name', 'display_name.fullword'],
-                    analyzer='my_search_analyzer',
-                    default_operator="AND",
-                    query=term)
+        # Do some preprocessing on the search terms for literal "" matching
+        s = _parse_es_search_terms(s, term)
 
     # User view (/user/username)
     if user:
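
As a quick illustration of the preprocessing described in the commit message (separate from the patch itself), the standalone sketch below reuses QUOTED_LITERAL_REGEX and the literal_matcher idea from _parse_es_search_terms so the splitting behaviour can be tried without an Elasticsearch instance. The helper name split_search_terms is made up for this example only.

import re

# Same pattern as in the patch: an optional leading '-' negates the quoted literal
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')


def split_search_terms(search_terms):
    ''' Split a raw query into (plain terms, must-phrases, must-not-phrases) '''
    must_set = set()
    must_not_set = set()

    def literal_matcher(match):
        # group(1) is the optional '-', group(2) is the text between the quotes
        (must_not_set if match.group(1) else must_set).add(match.group(2))
        # Drop the quoted part from the plain-text terms
        return ''

    remaining = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()
    return remaining, must_set, must_not_set


print(split_search_terms('foo bar "hello world" -"exclude this"'))
# -> ('foo bar', {'hello world'}, {'exclude this'})

In the patch, the two sets become must and must_not match_phrase queries against display_name.exact, and whatever plain text remains still goes through the existing simple_query_string; all three are combined into a single bool query.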