mirror of
https://gitlab.com/SIGBUS/nyaa.git
synced 2025-01-25 00:45:14 +00:00
[ES] Improve search term preprocessing to include literal groups (#477)
* Extend ES term preprocessing for OR groups Implements handling "foo"|"bar" literal OR groups in the Elasticsearch term preprocessor. Groups can be negated with -, but don't mesh with precedence (like plain literals). This is a partial hack; the real solution would be to parse the entire search terms ourselves, with AND and OR groups, negations etc. But having that work neatly with the simple_query_string would be a bit of a hassle. * Update help.html search tips since search (quoting strings) has changed a bit.
This commit is contained in:
parent
0b78428abc
commit
0cc25c3569
|
@ -70,7 +70,18 @@ def _generate_query_string(term, category, filter, user):
|
|||
|
||||
|
||||
# For preprocessing ES search terms in _parse_es_search_terms
|
||||
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.*?)"')
|
||||
QUOTED_LITERAL_REGEX = re.compile(r'(?i)(-)?"(.+?)"')
|
||||
# Matches [-]"foo"|"bar"[|"baz"...] OR-groups of quoted literals.
# Group 1 is the optional leading "-" (negates the whole group); group 2 is
# the full run of quoted literals joined by "|".
# Each literal is matched with [^"\n]+ instead of a lazy .+? so that a literal
# can never extend across a closing quote. With .+?, a search such as
# `"a" b "c"|"d"` backtracked into one big "group" starting at "a", and the
# plain term b in between was silently swallowed.
QUOTED_LITERAL_GROUP_REGEX = re.compile(r'''
    (?i)
    (-)?                # Negate entire group at once
    (
        "[^"\n]+"       # First literal
        (?:
            \|          # OR
            "[^"\n]+"   # Second literal
        )+              # repeating
    )
    ''', re.X)
|
||||
|
||||
|
||||
def _es_name_exact_phrase(literal):
|
||||
|
@ -98,7 +109,30 @@ def _parse_es_search_terms(search, search_terms):
|
|||
must_set = set()
|
||||
must_not_set = set()
|
||||
|
||||
def literal_matcher(match):
|
||||
must_or_groups = []
|
||||
must_not_or_groups = []
|
||||
|
||||
def must_group_matcher(match):
|
||||
''' Grabs [-]"foo"|"bar"[|"baz"...] groups from the search terms '''
|
||||
negated = bool(match.group(1))
|
||||
literal_group = match.group(2)
|
||||
|
||||
literals = QUOTED_LITERAL_REGEX.findall(literal_group)
|
||||
group_query = Q(
|
||||
'bool',
|
||||
should=[_es_name_exact_phrase(lit_m[1]) for lit_m in literals]
|
||||
)
|
||||
|
||||
if negated:
|
||||
must_not_or_groups.append(group_query)
|
||||
else:
|
||||
must_or_groups.append(group_query)
|
||||
|
||||
# Remove the parsed group from search terms
|
||||
return ''
|
||||
|
||||
def must_matcher(match):
|
||||
''' Grabs [-]"foo" literals from the search terms '''
|
||||
negated = bool(match.group(1))
|
||||
literal = match.group(2)
|
||||
|
||||
|
@ -111,11 +145,12 @@ def _parse_es_search_terms(search, search_terms):
|
|||
return ''
|
||||
|
||||
# Remove quoted parts (optionally prepended with -) and store them in the sets
|
||||
parsed_search_terms = QUOTED_LITERAL_REGEX.sub(literal_matcher, search_terms).strip()
|
||||
parsed_search_terms = QUOTED_LITERAL_GROUP_REGEX.sub(must_group_matcher, search_terms).strip()
|
||||
parsed_search_terms = QUOTED_LITERAL_REGEX.sub(must_matcher, parsed_search_terms).strip()
|
||||
|
||||
# Create phrase matches (if any)
|
||||
must_queries = [_es_name_exact_phrase(lit) for lit in must_set]
|
||||
must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set]
|
||||
must_queries = [_es_name_exact_phrase(lit) for lit in must_set] + must_or_groups
|
||||
must_not_queries = [_es_name_exact_phrase(lit) for lit in must_not_set] + must_not_or_groups
|
||||
|
||||
if parsed_search_terms:
|
||||
# Normal text search without the quoted parts
|
||||
|
|
|
@ -38,21 +38,25 @@
|
|||
</div>
|
||||
<div>
|
||||
You can combine search terms with the <kbd>|</kbd> operator, such as
|
||||
<kbd>horrible|cartel</kbd>.
|
||||
<kbd>foo|bar</kbd>, to match any of the words instead of all of them.
|
||||
</div>
|
||||
<div>
|
||||
To exclude results matching a certain word, prefix it with <kbd>-</kbd>,
|
||||
e.g. <kbd>FFF -memesubs</kbd>, which will return torrents with <em>FFF</em> in the
|
||||
name, but not those which have <em>memesubs</em> in the name as well.
|
||||
e.g. <kbd>foo -bar</kbd>, which will return torrents with <em>foo</em> in the
|
||||
name, but not those which have <em>bar</em> in the name as well.
|
||||
</div>
|
||||
<div>
|
||||
If you want to search for a several-word expression in its entirety, you can
|
||||
surround searches with <kbd>"</kbd> (double quotes), such as
|
||||
<kbd>"foo bar"</kbd>, which would match torrents named <em>foo bar</em> but not
|
||||
those named <em>bar foo</em>.
|
||||
those named <em>bar foo</em>. You may also use the aforementioned <kbd>|</kbd> to group
|
||||
phrases together: <kbd>"foo bar"|"foo baz"</kbd>. You can negate the entire
|
||||
group with <kbd>-</kbd> (e.g. <kbd>-"foo bar"|"foo baz"</kbd>), but not single items.
|
||||
</div>
|
||||
<div>
|
||||
You can also use <kbd>(</kbd> and <kbd>)</kbd> to signify precedence.
|
||||
You can also use <kbd>(</kbd> and <kbd>)</kbd> to signify precedence, but quoted strings do
|
||||
not honor this. Using <kbd>(hello world) "foo bar"</kbd> is fine, but quoted strings inside
|
||||
the parentheses will lead to unexpected results.
|
||||
</div>
|
||||
|
||||
{{ linkable_header("Reporting Torrents", "reporting") }}
|
||||
|
|
Loading…
Reference in a new issue