1
0
Fork 0
mirror of https://gitlab.com/SIGBUS/nyaa.git synced 2024-12-22 15:30:01 +00:00

Preprocess ES terms for better literal matching

This commit adds a new .exact subfield to display_name, which holds a
barely-filtered version of the original title we can do "literal"
matching against. This is not real substring matching, but quoting
terms now actually does something!

Implements a simple preprocessor that extracts quoted parts from the search
terms, each optionally prefixed with - to negate it.
The preprocessor builds a single query that joins all three query types:
the simple_query_string, the must-phrases and the must-not-phrases.
This commit is contained in:
TheAMM 2018-04-13 14:46:35 +03:00
parent f31af836d9
commit 60d1570fb5
2 changed files with 84 additions and 6 deletions

View file

@ -23,6 +23,11 @@ settings:
- my_ngram - my_ngram
- word_delimit - word_delimit
- trim_zero - trim_zero
# For exact matching - simple lowercase + whitespace delimiter
exact_analyzer:
tokenizer: whitespace
filter:
- lowercase
# For matching full words longer than the ngram limit (15 chars) # For matching full words longer than the ngram limit (15 chars)
my_fullword_index_analyzer: my_fullword_index_analyzer:
type: custom type: custom
@ -91,6 +96,10 @@ mappings:
fullword: fullword:
type: text type: text
analyzer: my_fullword_index_analyzer analyzer: my_fullword_index_analyzer
# Stored for exact phrase matching
exact:
type: text
analyzer: exact_analyzer
created_time: created_time:
type: date type: date
# Only in the ES index for generating magnet links # Only in the ES index for generating magnet links

View file

@ -69,6 +69,79 @@ def _generate_query_string(term, category, filter, user):
return params return params
# For preprocessing ES search terms in _parse_es_search_terms.
# Matches one double-quoted literal, optionally prefixed with a dash:
#   group 1: '-' when the literal is negated, otherwise None
#   group 2: the text between the quotes (non-greedy, may be empty)
# No case-insensitivity flag is needed: the pattern contains no letters.
QUOTED_LITERAL_REGEX = re.compile(r'(-)?"(.*?)"')
def _es_name_exact_phrase(literal):
    ''' Build a match_phrase Query that looks for the given literal in the
        display_name.exact subfield, analyzed with exact_analyzer. '''
    # Per-field options of the phrase match
    phrase_options = {
        'query': literal,
        'analyzer': 'exact_analyzer'
    }
    return Q({'match_phrase': {'display_name.exact': phrase_options}})
def _parse_es_search_terms(search, search_terms):
    ''' Parse search terms into a query with properly handled literal phrases
        (the simple_query_string is not so great with exact results).

        For example:
            foo bar "hello world" -"exclude this"
        will become a must simple_query_string for "foo bar", a must phrase_match for
        "hello world" and a must_not for "exclude this".

        Quoted literals that are empty or contain only whitespace (e.g. "" or -" ")
        are removed from the search terms but do not generate a phrase query.

        search: the search object to extend; only its .query() method is used here.
        search_terms: the raw search string entered by the user.

        Returns the search with the generated bool-query added to it, or the
        search unchanged when the terms produce no query at all. '''
    # Literal must and must-not sets (sets, so repeated literals collapse into one query)
    must_set = set()
    must_not_set = set()

    def literal_matcher(match):
        negated = bool(match.group(1))
        literal = match.group(2)

        # A blank literal would analyze to zero tokens; a phrase match on zero
        # tokens matches no documents by default in Elasticsearch, so a stray ""
        # would wipe out every result. Skip it instead.
        if literal.strip():
            if negated:
                must_not_set.add(literal)
            else:
                must_set.add(literal)

        # Remove the parsed literal from search terms
        return ''

    # Remove quoted parts (optionally prepended with -) and store them in the sets
    parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()

    # Create phrase matches (if any)
    must_queries = [_es_name_exact_phrase(lit) for lit in must_set]
    must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set]

    if parsed_search_terms:
        # Normal text search without the quoted parts
        must_queries.append(
            Q(
                'simple_query_string',
                # Query both fields, latter for words with >15 chars
                fields=['display_name', 'display_name.fullword'],
                analyzer='my_search_analyzer',
                default_operator="AND",
                query=parsed_search_terms
            )
        )

    if must_queries or must_not_queries:
        # Create a combined Query with the positive and negative matches
        combined_search_query = Q(
            'bool',
            must=must_queries,
            must_not=must_not_queries
        )
        search = search.query(combined_search_query)

    return search
def search_elastic(term='', user=None, sort='id', order='desc', def search_elastic(term='', user=None, sort='id', order='desc',
category='0_0', quality_filter='0', page=1, category='0_0', quality_filter='0', page=1,
rss=False, admin=False, logged_in_user=None, rss=False, admin=False, logged_in_user=None,
@ -165,12 +238,8 @@ def search_elastic(term='', user=None, sort='id', order='desc',
# Apply search term # Apply search term
if term: if term:
s = s.query('simple_query_string', # Do some preprocessing on the search terms for literal "" matching
# Query both fields, latter for words with >15 chars s = _parse_es_search_terms(s, term)
fields=['display_name', 'display_name.fullword'],
analyzer='my_search_analyzer',
default_operator="AND",
query=term)
# User view (/user/username) # User view (/user/username)
if user: if user: