From 0b78428abc66f248cfcc1161f2e4b714912966a2 Mon Sep 17 00:00:00 2001
From: Anna-Maria Meriniemi
Date: Sat, 14 Apr 2018 03:06:25 +0300
Subject: [PATCH] [ES Change] Improve Elasticsearch term quoting (#473)

* Optimize Elasticsearch fullword field

Since the main display_name field ngrams words up to 15 characters,
anything at or under that length is already indexed - the fullword field
(which we have for words longer than 15 characters) only needs to index
words longer than that.

* Preprocess ES terms for better literal matching

This commit adds a new .exact subfield to display_name, which holds a
barely-filtered version of the original title that we can do "literal"
matching against. This is not real substring matching, but quoting terms
now actually does something!

Implements a simple preprocessor that extracts quoted parts from the
search terms, optionally prefixed with - to negate them. The preprocessor
creates a query joining all three query types: the simple_query_string,
the must-phrases and the must-not-phrases.
---
 es_mapping.yml | 17 ++++++++++-
 nyaa/search.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/es_mapping.yml b/es_mapping.yml
index 4d4e39b..e064ce1 100644
--- a/es_mapping.yml
+++ b/es_mapping.yml
@@ -23,6 +23,11 @@ settings:
         - my_ngram
         - word_delimit
         - trim_zero
+      # For exact matching - simple lowercase + whitespace delimiter
+      exact_analyzer:
+        tokenizer: whitespace
+        filter:
+          - lowercase
       # For matching full words longer than the ngram limit (15 chars)
       my_fullword_index_analyzer:
         type: custom
@@ -32,13 +37,19 @@ settings:
         filter:
           - lowercase
           - word_delimit
-          # These should be enough, as my_index_analyzer will match the rest
+          # Skip tokens shorter than N characters,
+          # since they're already indexed in the main field
+          - fullword_min
     filter:
       my_ngram:
         type: edgeNGram
         min_gram: 1
         max_gram: 15
+      fullword_min:
+        type: length
+        # Remember to change this if you change the max_gram above!
+        min: 16
       resolution:
         type: pattern_capture
         patterns: ["(\\d+)[xX](\\d+)"]
@@ -85,6 +96,10 @@ mappings:
           fullword:
             type: text
             analyzer: my_fullword_index_analyzer
+          # Indexed for exact phrase matching
+          exact:
+            type: text
+            analyzer: exact_analyzer
       created_time:
         type: date
       # Only in the ES index for generating magnet links
diff --git a/nyaa/search.py b/nyaa/search.py
index 364aeb6..0adb43b 100644
--- a/nyaa/search.py
+++ b/nyaa/search.py
@@ -69,6 +69,79 @@ def _generate_query_string(term, category, filter, user):
     return params
 
 
+# For preprocessing ES search terms in _parse_es_search_terms
+QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')
+
+
+def _es_name_exact_phrase(literal):
+    ''' Returns a Query for a phrase match on the display_name for a given literal '''
+    return Q({
+        'match_phrase': {
+            'display_name.exact': {
+                'query': literal,
+                'analyzer': 'exact_analyzer'
+            }
+        }
+    })
+
+
+def _parse_es_search_terms(search, search_terms):
+    ''' Parse search terms into a query with properly handled literal phrases
+        (the simple_query_string is not so great with exact results).
+        For example:
+            foo bar "hello world" -"exclude this"
+        will become a must simple_query_string for "foo bar", a must match_phrase for
+        "hello world" and a must_not for "exclude this".
+        Returns the search with the generated bool-query added to it. '''
+
+    # Literal must and must-not sets
+    must_set = set()
+    must_not_set = set()
+
+    def literal_matcher(match):
+        negated = bool(match.group(1))
+        literal = match.group(2)
+
+        if negated:
+            must_not_set.add(literal)
+        else:
+            must_set.add(literal)
+
+        # Remove the parsed literal from search terms
+        return ''
+
+    # Remove quoted parts (optionally prepended with -) and store them in the sets
+    parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()
+
+    # Create phrase matches (if any)
+    must_queries = [_es_name_exact_phrase(lit) for lit in must_set]
+    must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set]
+
+    if parsed_search_terms:
+        # Normal text search without the quoted parts
+        must_queries.append(
+            Q(
+                'simple_query_string',
+                # Query both fields, latter for words with >15 chars
+                fields=['display_name', 'display_name.fullword'],
+                analyzer='my_search_analyzer',
+                default_operator="AND",
+                query=parsed_search_terms
+            )
+        )
+
+    if must_queries or must_not_queries:
+        # Create a combined Query with the positive and negative matches
+        combined_search_query = Q(
+            'bool',
+            must=must_queries,
+            must_not=must_not_queries
+        )
+        search = search.query(combined_search_query)
+
+    return search
+
+
 def search_elastic(term='', user=None, sort='id', order='desc',
                    category='0_0', quality_filter='0', page=1, rss=False, admin=False,
                    logged_in_user=None,
@@ -165,12 +238,8 @@
 
     # Apply search term
     if term:
-        s = s.query('simple_query_string',
-                    # Query both fields, latter for words with >15 chars
-                    fields=['display_name', 'display_name.fullword'],
-                    analyzer='my_search_analyzer',
-                    default_operator="AND",
-                    query=term)
+        # Do some preprocessing on the search terms for literal "" matching
+        s = _parse_es_search_terms(s, term)
 
     # User view (/user/username)
     if user:
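
As a quick illustration of the preprocessing described in the commit message (separate from the patch itself), the standalone sketch below reuses QUOTED_LITERAL_REGEX and the literal_matcher idea from _parse_es_search_terms so the splitting behaviour can be tried without an Elasticsearch instance. The helper name split_search_terms is made up for this example only.

import re

# Same pattern as in the patch: an optional leading '-' negates the quoted literal
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')


def split_search_terms(search_terms):
    ''' Split a raw query into (plain terms, must-phrases, must-not-phrases) '''
    must_set = set()
    must_not_set = set()

    def literal_matcher(match):
        # group(1) is the optional '-', group(2) is the text between the quotes
        (must_not_set if match.group(1) else must_set).add(match.group(2))
        # Drop the quoted part from the plain-text terms
        return ''

    remaining = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()
    return remaining, must_set, must_not_set


print(split_search_terms('foo bar "hello world" -"exclude this"'))
# -> ('foo bar', {'hello world'}, {'exclude this'})

In the patch, the two sets become must and must_not match_phrase queries against display_name.exact, and whatever plain text remains still goes through the existing simple_query_string; all three are combined into a single bool query.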