mirror of
https://gitlab.com/SIGBUS/nyaa.git
synced 2024-11-05 09:25:54 +00:00
bc1901baa5
...by splitting input into characters, instead of whitespace delimited words. This means you can now match partial words, real substrings from anywhere: "foo ba" will match "Foo Bar Baz", while previously you had to have full words ("foo bar") to match anything. My dev setup incurred an 8% increase in storage usage, from ~13MB to ~14MB (for ~40k torrents). Small change, big improvement. Wonder why I didn't do this at first.
147 lines
3.8 KiB
YAML
147 lines
3.8 KiB
YAML
---
# CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
# for inline comments.
# Index-level settings: the custom analysis chain (analyzers, tokenizer,
# token filters, char filter) followed by single-node index options.
settings:
  analysis:
    analyzer:
      # Not referenced by the mappings in this file; presumably selected at
      # query time - verify against the search code.
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          # NOTE(review): the "standard" token filter is deprecated in newer
          # Elasticsearch releases - confirm against the targeted version.
          - standard
          - lowercase
      # Index-time analyzer for display_name: edge n-grams (1-15 chars) of
      # every word, so prefixes of words match.
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - resolution
          - lowercase
          - word_delimit
          - my_ngram
          - trim_zero
          - unique
      # For exact matching - separate each character for substring
      # matching + lowercase
      exact_analyzer:
        tokenizer: exact_tokenizer
        filter:
          - lowercase
      # For matching full words longer than the ngram limit (15 chars)
      my_fullword_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - word_delimit
          # Skip tokens shorter than N characters,
          # since they're already indexed in the main field
          - fullword_min
          - unique

    tokenizer:
      # Splits input into characters, for exact substring matching
      exact_tokenizer:
        type: pattern
        # Every match of capture group 1 (a single character) is one token.
        pattern: "(.)"
        group: 1

    filter:
      my_ngram:
        type: edgeNGram
        min_gram: 1
        max_gram: 15
      fullword_min:
        type: length
        # Keep this at max_gram + 1: remember to change it if you change
        # the max_gram of my_ngram above!
        min: 16
      # Emits the width and height of tokens such as 1920x1080 as
      # additional tokens.
      resolution:
        type: pattern_capture
        patterns: ["(\\d+)[xX](\\d+)"]
      # Emits a numeric token without its leading zeros (e.g. 01 -> 1).
      trim_zero:
        type: pattern_capture
        patterns: ["0*([0-9]*)"]
      word_delimit:
        type: word_delimiter
        preserve_original: true
        split_on_numerics: false
    char_filter:
      # Maps "-" and "!" to "_", and "_" to a space (\u0020).
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_", "_=>\\u0020"]
  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas: 0
    mapper:
      # disable elasticsearch's "helpful" autoschema
      dynamic: false
    # since we disabled the _all field, default query the
    # name of the torrent.
    query:
      default_field: display_name
# Field mappings for the "torrent" document type. The analyzers named here
# are the ones defined under settings.analysis.analyzer in this file.
mappings:
  torrent:
    # don't want everything concatenated
    _all:
      enabled: false
    properties:
      id:
        type: long
      display_name:
        # TODO could do a fancier tokenizer here to parse out the
        # scene convention of stuff in brackets, plus stuff like k-on
        type: text
        analyzer: my_index_analyzer
        fielddata: true  # Is this required?
        fields:
          # Multi-field for full-word matching (when going over ngram limits)
          # Note: will have to be queried for, not automatic
          fullword:
            type: text
            analyzer: my_fullword_index_analyzer
          # Stored for exact phrase matching
          exact:
            type: text
            analyzer: exact_analyzer
      created_time:
        type: date
      # Only in the ES index for generating magnet links
      info_hash:
        enabled: false
      filesize:
        type: long
      anonymous:
        type: boolean
      trusted:
        type: boolean
      remake:
        type: boolean
      complete:
        type: boolean
      hidden:
        type: boolean
      deleted:
        type: boolean
      has_torrent:
        type: boolean
      download_count:
        type: long
      leech_count:
        type: long
      seed_count:
        type: long
      comment_count:
        type: long
      # these ids are really only for filtering, thus keyword
      uploader_id:
        type: keyword
      main_category_id:
        type: keyword
      sub_category_id:
        type: keyword