mirror of
https://gitlab.com/SIGBUS/nyaa.git
synced 2025-01-26 06:55:14 +00:00
[ES Change] Improve Elasticsearch term quoting (#473)
* Optimize Elasticsearch fullword field: Since the main display_name field ngrams words up to 15 characters, any word at or under that length will already be indexed - the fullword field (which we have for words longer than 15 characters) needs to index only words longer than that. * Preprocess ES terms for better literal matching: This commit adds a new .exact subfield to display_name, which holds a barely-filtered version of the original title that we can do "literal" matching against. This is not real substring matching, but quoting terms now actually does something! Implements a simple preprocessor that extracts quoted parts from the search terms, optionally prefixed with - to negate them. The preprocessor will create a query that joins all three query types: the simple_query_string, must-phrases and must-not-phrases.
This commit is contained in:
parent
8f9400bb5f
commit
0b78428abc
|
@ -23,6 +23,11 @@ settings:
|
||||||
- my_ngram
|
- my_ngram
|
||||||
- word_delimit
|
- word_delimit
|
||||||
- trim_zero
|
- trim_zero
|
||||||
|
# For exact matching - simple lowercase + whitespace delimiter
|
||||||
|
exact_analyzer:
|
||||||
|
tokenizer: whitespace
|
||||||
|
filter:
|
||||||
|
- lowercase
|
||||||
# For matching full words longer than the ngram limit (15 chars)
|
# For matching full words longer than the ngram limit (15 chars)
|
||||||
my_fullword_index_analyzer:
|
my_fullword_index_analyzer:
|
||||||
type: custom
|
type: custom
|
||||||
|
@ -32,13 +37,19 @@ settings:
|
||||||
filter:
|
filter:
|
||||||
- lowercase
|
- lowercase
|
||||||
- word_delimit
|
- word_delimit
|
||||||
# These should be enough, as my_index_analyzer will match the rest
|
# Skip tokens shorter than N characters,
|
||||||
|
# since they're already indexed in the main field
|
||||||
|
- fullword_min
|
||||||
|
|
||||||
filter:
|
filter:
|
||||||
my_ngram:
|
my_ngram:
|
||||||
type: edgeNGram
|
type: edgeNGram
|
||||||
min_gram: 1
|
min_gram: 1
|
||||||
max_gram: 15
|
max_gram: 15
|
||||||
|
fullword_min:
|
||||||
|
type: length
|
||||||
|
# Remember to change this if you change my_ngram's max_gram above (keep it at max_gram + 1)!
|
||||||
|
min: 16
|
||||||
resolution:
|
resolution:
|
||||||
type: pattern_capture
|
type: pattern_capture
|
||||||
patterns: ["(\\d+)[xX](\\d+)"]
|
patterns: ["(\\d+)[xX](\\d+)"]
|
||||||
|
@ -85,6 +96,10 @@ mappings:
|
||||||
fullword:
|
fullword:
|
||||||
type: text
|
type: text
|
||||||
analyzer: my_fullword_index_analyzer
|
analyzer: my_fullword_index_analyzer
|
||||||
|
# Stored for exact phrase matching
|
||||||
|
exact:
|
||||||
|
type: text
|
||||||
|
analyzer: exact_analyzer
|
||||||
created_time:
|
created_time:
|
||||||
type: date
|
type: date
|
||||||
# Only in the ES index for generating magnet links
|
# Only in the ES index for generating magnet links
|
||||||
|
|
|
@ -69,6 +69,79 @@ def _generate_query_string(term, category, filter, user):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
# For preprocessing ES search terms in _parse_es_search_terms
|
||||||
|
# Matches one double-quoted literal. Group 1 is the optional leading "-"
# (negation marker); group 2 is the text between the quotes (non-greedy, so it
# stops at the next quote, and it may be empty).
# NOTE(review): the (?i) flag has no effect, as the pattern contains no letters.
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')
|
||||||
|
|
||||||
|
|
||||||
|
def _es_name_exact_phrase(literal):
    ''' Build a match_phrase Query that looks for `literal` in the
        display_name.exact subfield, analyzing the query text with
        exact_analyzer. '''
    # Per-field options of the match_phrase query
    field_options = {
        'query': literal,
        'analyzer': 'exact_analyzer'
    }
    phrase_body = {
        'match_phrase': {
            'display_name.exact': field_options
        }
    }
    return Q(phrase_body)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_es_search_terms(search, search_terms):
    ''' Parse search terms into a query with properly handled literal phrases
        (the simple_query_string is not so great with exact results).
        For example:
            foo bar "hello world" -"exclude this"
        will become a must simple_query_string for "foo bar", a must match_phrase for
        "hello world" and a must_not match_phrase for "exclude this".

        Quoted parts that are empty or contain only whitespace are removed from
        the terms without producing a phrase clause.

        search:       the search object to add the query to (via .query())
        search_terms: the raw search string

        Returns the search with the generated bool-query added to it, or the
        search unchanged if there is nothing left to query for. '''

    # Literal must and must-not sets (sets, so a repeated literal gives one clause)
    must_set = set()
    must_not_set = set()

    def literal_matcher(match):
        negated = bool(match.group(1))
        literal = match.group(2)

        # A blank literal gives match_phrase no tokens to look for; as a must
        # clause that would (with Elasticsearch's default zero_terms_query of
        # "none") match nothing and wipe out the whole result set, so skip it.
        if literal.strip():
            if negated:
                must_not_set.add(literal)
            else:
                must_set.add(literal)

        # Remove the parsed literal from search terms. Substituting a space
        # (rather than nothing) keeps the words on either side of the quotes
        # from being glued together: foo"bar"baz -> "foo baz", not "foobaz".
        return ' '

    # Remove quoted parts (optionally prepended with -) and store them in the sets
    parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()

    # Create phrase matches (if any); sorted so the generated query is
    # deterministic regardless of set iteration order
    must_queries = [_es_name_exact_phrase(lit) for lit in sorted(must_set)]
    must_not_queries = [_es_name_exact_phrase(lit) for lit in sorted(must_not_set)]

    if parsed_search_terms:
        # Normal text search without the quoted parts
        must_queries.append(
            Q(
                'simple_query_string',
                # Query both fields, latter for words with >15 chars
                fields=['display_name', 'display_name.fullword'],
                analyzer='my_search_analyzer',
                default_operator="AND",
                query=parsed_search_terms
            )
        )

    if must_queries or must_not_queries:
        # Create a combined Query with the positive and negative matches
        combined_search_query = Q(
            'bool',
            must=must_queries,
            must_not=must_not_queries
        )
        search = search.query(combined_search_query)

    return search
|
||||||
|
|
||||||
|
|
||||||
def search_elastic(term='', user=None, sort='id', order='desc',
|
def search_elastic(term='', user=None, sort='id', order='desc',
|
||||||
category='0_0', quality_filter='0', page=1,
|
category='0_0', quality_filter='0', page=1,
|
||||||
rss=False, admin=False, logged_in_user=None,
|
rss=False, admin=False, logged_in_user=None,
|
||||||
|
@ -165,12 +238,8 @@ def search_elastic(term='', user=None, sort='id', order='desc',
|
||||||
|
|
||||||
# Apply search term
|
# Apply search term
|
||||||
if term:
|
if term:
|
||||||
s = s.query('simple_query_string',
|
# Do some preprocessing on the search terms for literal "" matching
|
||||||
# Query both fields, latter for words with >15 chars
|
s = _parse_es_search_terms(s, term)
|
||||||
fields=['display_name', 'display_name.fullword'],
|
|
||||||
analyzer='my_search_analyzer',
|
|
||||||
default_operator="AND",
|
|
||||||
query=term)
|
|
||||||
|
|
||||||
# User view (/user/username)
|
# User view (/user/username)
|
||||||
if user:
|
if user:
|
||||||
|
|
Loading…
Reference in a new issue