1
0
Fork 0
mirror of https://gitlab.com/SIGBUS/nyaa.git synced 2024-12-22 04:20:00 +00:00
nyaa/es_mapping.yml
Anna-Maria Meriniemi bc1901baa5 ES: implement real substring matching (#500)
...by splitting input into characters, instead of whitespace delimited
words. This means you can now match partial words, real substrings from
anywhere: "foo ba" will match "Foo Bar Baz", while previously you had to
have full words ("foo bar") to match anything.

My dev setup incurred an 8% increase in storage usage, from ~13MB to
~14MB (for ~40k torrents).
Small change, big improvement. Wonder why I didn't do this at first.
2018-06-08 00:59:19 -07:00

147 lines
3.8 KiB
YAML

---
# CREATE DTABASE/TABLE equivalent for elasticsearch, in yaml
# fo inline comments.
settings:
analysis:
analyzer:
my_search_analyzer:
type: custom
tokenizer: standard
char_filter:
- my_char_filter
filter:
- standard
- lowercase
my_index_analyzer:
type: custom
tokenizer: standard
char_filter:
- my_char_filter
filter:
- resolution
- lowercase
- word_delimit
- my_ngram
- trim_zero
- unique
# For exact matching - separate each character for substring matching + lowercase
exact_analyzer:
tokenizer: exact_tokenizer
filter:
- lowercase
# For matching full words longer than the ngram limit (15 chars)
my_fullword_index_analyzer:
type: custom
tokenizer: standard
char_filter:
- my_char_filter
filter:
- lowercase
- word_delimit
# Skip tokens shorter than N characters,
# since they're already indexed in the main field
- fullword_min
- unique
tokenizer:
# Splits input into characters, for exact substring matching
exact_tokenizer:
type: pattern
pattern: "(.)"
group: 1
filter:
my_ngram:
type: edgeNGram
min_gram: 1
max_gram: 15
fullword_min:
type: length
# Remember to change this if you change the max_gram below!
min: 16
resolution:
type: pattern_capture
patterns: ["(\\d+)[xX](\\d+)"]
trim_zero:
type: pattern_capture
patterns: ["0*([0-9]*)"]
word_delimit:
type: word_delimiter
preserve_original: true
split_on_numerics: false
char_filter:
my_char_filter:
type: mapping
mappings: ["-=>_", "!=>_", "_=>\\u0020"]
index:
# we're running a single es node, so no sharding necessary,
# plus replicas don't really help either.
number_of_shards: 1
number_of_replicas : 0
mapper:
# disable elasticsearch's "helpful" autoschema
dynamic: false
# since we disabled the _all field, default query the
# name of the torrent.
query:
default_field: display_name
mappings:
torrent:
# don't want everything concatenated
_all:
enabled: false
properties:
id:
type: long
display_name:
# TODO could do a fancier tokenizer here to parse out the
# the scene convention of stuff in brackets, plus stuff like k-on
type: text
analyzer: my_index_analyzer
fielddata: true # Is this required?
fields:
# Multi-field for full-word matching (when going over ngram limits)
# Note: will have to be queried for, not automatic
fullword:
type: text
analyzer: my_fullword_index_analyzer
# Stored for exact phrase matching
exact:
type: text
analyzer: exact_analyzer
created_time:
type: date
# Only in the ES index for generating magnet links
info_hash:
enabled: false
filesize:
type: long
anonymous:
type: boolean
trusted:
type: boolean
remake:
type: boolean
complete:
type: boolean
hidden:
type: boolean
deleted:
type: boolean
has_torrent:
type: boolean
download_count:
type: long
leech_count:
type: long
seed_count:
type: long
comment_count:
type: long
# these ids are really only for filtering, thus keyword
uploader_id:
type: keyword
main_category_id:
type: keyword
sub_category_id:
type: keyword