Before, long.tokens.with.dots.or.dashes would get edgengrammed up to the ngram limit, so we'd get to long.tokens.wit which would then be split - discarding "with.dots.or.dashes" completely. The fullword index would keep the complete large token, but without any ngramming, so incomplete searches (like "tokens") would not match it, only the full token. Now, we split words before ngramming them, so the main index will properly handle words up to the ngram limit. The fullword index will still handle the longer words for non-ngram matching. Also optimized away duplicate tokens from the indices (since we rely on boolean matching, not scoring) to save a couple megabytes of space.
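A quick way to sanity-check this behavior is Elasticsearch's `_analyze` API. Below is a sketch (not part of this repo: the index name `nyaa`, the client setup, and the expected output are assumptions for illustration) that runs a problem token through both analyzers defined in the mapping:

```python
# Sketch: run a dotted token through both analyzers via the _analyze API.
# The index name "nyaa" and the local client setup are assumptions.
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])

text = "long.tokens.with.dots.or.dashes"
for analyzer in ("my_index_analyzer", "my_fullword_index_analyzer"):
    result = es.indices.analyze(index="nyaa", body={"analyzer": analyzer, "text": text})
    print(analyzer, "->", [t["token"] for t in result["tokens"]])

# With the fix, my_index_analyzer should emit edge ngrams of each split word
# ("long", "tokens", "with", ...), so a search for "tokens" matches; the
# fullword analyzer keeps only tokens of 16+ chars, e.g. the preserved
# original "long.tokens.with.dots.or.dashes".
```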
---
# CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
# for inline comments.
settings:
  analysis:
    analyzer:
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - standard
          - lowercase
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - resolution
          - lowercase
          - word_delimit
          - my_ngram
          - trim_zero
          - unique
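      # Illustration (comment only, not consumed by ES): with the chain above,
      # "Long.Tokens 1280x720" yields "long.tokens", "long", "tokens",
      # "1280x720", "1280" and "720", each expanded into its 1..15-char
      # edge ngrams, with duplicates dropped by "unique".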
      # For exact matching - simple lowercase + whitespace delimiter
      exact_analyzer:
        tokenizer: whitespace
        filter:
          - lowercase
      # For matching full words longer than the ngram limit (15 chars)
      my_fullword_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - word_delimit
          # Skip tokens shorter than N characters,
          # since they're already indexed in the main field
          - fullword_min
          - unique

    filter:
      my_ngram:
        type: edgeNGram
        min_gram: 1
        max_gram: 15
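      # Illustration: "horrible" becomes "h", "ho", ..., "horrible"; anything
      # past 15 chars is truncated here, which is what the fullword field
      # above compensates for.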
      fullword_min:
        type: length
        # Remember to change this if you change the max_gram above!
        min: 16
      resolution:
        type: pattern_capture
        patterns: ["(\\d+)[xX](\\d+)"]
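      # Illustration: "1280x720" additionally emits the captures "1280" and
      # "720", so either dimension matches on its own.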
      trim_zero:
        type: pattern_capture
        patterns: ["0*([0-9]*)"]
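      # Illustration: "007" additionally emits "7", letting zero-padded
      # episode numbers match their unpadded form.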
      word_delimit:
        type: word_delimiter
        preserve_original: true
        split_on_numerics: false
    char_filter:
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_", "_=>\\u0020"]
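      # Illustration: dashes and exclamation marks become underscores and
      # underscores become spaces, so "K-ON!" is indexed from "K_ON_"
      # (mappings apply to the original text in one pass, not recursively).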
  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas: 0
    mapper:
      # disable elasticsearch's "helpful" autoschema
      dynamic: false
    # since we disabled the _all field, default to querying
    # the name of the torrent.
    query:
      default_field: display_name

mappings:
  torrent:
    # don't want everything concatenated
    _all:
      enabled: false
    properties:
      id:
        type: long
      display_name:
        # TODO could do a fancier tokenizer here to parse out
        # the scene convention of stuff in brackets, plus stuff like k-on
        type: text
        analyzer: my_index_analyzer
        fielddata: true # Is this required?
        fields:
          # Multi-field for full-word matching (when going over ngram limits)
          # Note: will have to be queried for, not automatic
          fullword:
            type: text
            analyzer: my_fullword_index_analyzer
          # Stored for exact phrase matching
          exact:
            type: text
            analyzer: exact_analyzer
      created_time:
        type: date
      # Only in the ES index for generating magnet links
      info_hash:
        enabled: false
      filesize:
        type: long
      anonymous:
        type: boolean
      trusted:
        type: boolean
      remake:
        type: boolean
      complete:
        type: boolean
      hidden:
        type: boolean
      deleted:
        type: boolean
      has_torrent:
        type: boolean
      download_count:
        type: long
      leech_count:
        type: long
      seed_count:
        type: long
      comment_count:
        type: long
      # these ids are really only for filtering, thus keyword
      uploader_id:
        type: keyword
      main_category_id:
        type: keyword
      sub_category_id:
        type: keyword
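Since the fullword multi-field is "queried for, not automatic" (per the comment in the mapping), queries have to name it explicitly. A sketch of what that might look like from the official Python client (the index name `nyaa`, the client setup, and the use of `simple_query_string` are assumptions for illustration, not taken from this file):

```python
# Sketch: search the ngrammed main field together with the fullword
# multi-field defined in the mapping above. Hypothetical index name.
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])

resp = es.search(
    index="nyaa",  # assumption; this file does not name the index
    body={
        "query": {
            "simple_query_string": {
                "query": "tokens",
                # The fullword field must be listed explicitly -- it is
                # not searched automatically.
                "fields": ["display_name", "display_name.fullword"],
                "default_operator": "and",
            }
        }
    },
)
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["display_name"])
```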