mirror of
https://gitlab.com/SIGBUS/nyaa.git
synced 2024-10-31 23:45:54 +00:00
2d0cf7cbb4
This fixes searching for "Machiavellianism", 16 chars ("Machiavellianis", 15 chars, worked previously). Does not (seem to!) break anything, but requires a re-indexing of ES.
123 lines
3.1 KiB
YAML
123 lines
3.1 KiB
YAML
---
|
|
# CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
|
|
# for inline comments.
|
|
# Index-level settings: the custom analysis chain (analyzers, token filters,
# char filters) followed by the index options.
settings:
  analysis:
    analyzer:
      # Same tokenizer and char filter as the index analyzers, but without
      # the ngram filter.
      # NOTE(review): presumably applied to query text at search time; the
      # reference to it is not in this file -- confirm against the caller.
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          # NOTE(review): the `standard` token filter and the `edgeNGram`
          # type name used below are legacy spellings; verify both against
          # the deployed Elasticsearch version before upgrading.
          - standard
          - lowercase

      # Analyzer of the main `display_name` field: produces edge ngrams
      # (see my_ngram) so that word prefixes are indexed.
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - resolution
          - lowercase
          - my_ngram
          - word_delimit
          - trim_zero

      # For matching full words longer than the ngram limit (15 chars)
      my_fullword_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - word_delimit
          # These should be enough, as my_index_analyzer will match the rest

    filter:
      # Edge ngrams of 1 to 15 characters; longer words are only matchable
      # in full through my_fullword_index_analyzer.
      my_ngram:
        type: edgeNGram
        min_gram: 1
        max_gram: 15
      # Captures both numbers of a "<digits>x<digits>" token (the separator
      # may be a lower- or upper-case x).
      resolution:
        type: pattern_capture
        patterns: ["(\\d+)[xX](\\d+)"]
      # Captures the digits that follow any run of leading zeros.
      trim_zero:
        type: pattern_capture
        patterns: ["0*([0-9]*)"]
      # Word-delimiter splitting that keeps the original token and does not
      # split on numerics.
      word_delimit:
        type: word_delimiter
        preserve_original: true
        split_on_numerics: false

    char_filter:
      # Character rewrites applied before tokenizing: "-" and "!" map to
      # "_", and "_" maps to a space (\u0020).
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_", "_=>\\u0020"]

  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas: 0
    mapper:
      # disable elasticsearch's "helpful" autoschema
      dynamic: false
    # since we disabled the _all field, default query the
    # name of the torrent.
    query:
      default_field: display_name
|
|
# Field mappings for the single `torrent` document type.
mappings:
  torrent:
    # don't want everything concatenated
    _all:
      enabled: false
    properties:
      id:
        type: long
      display_name:
        # TODO could do a fancier tokenizer here to parse out the
        # scene convention of stuff in brackets, plus stuff like k-on
        type: text
        analyzer: my_index_analyzer
        fielddata: true  # Is this required?
        fields:
          # Multi-field for full-word matching (when going over ngram limits)
          # Note: will have to be queried for, not automatic
          fullword:
            type: text
            analyzer: my_fullword_index_analyzer
      created_time:
        type: date
      # Only in the ES index for generating magnet links
      info_hash:
        enabled: false
      filesize:
        type: long

      # Boolean flags.
      anonymous:
        type: boolean
      trusted:
        type: boolean
      remake:
        type: boolean
      complete:
        type: boolean
      hidden:
        type: boolean
      deleted:
        type: boolean
      has_torrent:
        type: boolean

      # Numeric counters.
      download_count:
        type: long
      leech_count:
        type: long
      seed_count:
        type: long
      comment_count:
        type: long

      # these ids are really only for filtering, thus keyword
      uploader_id:
        type: keyword
      main_category_id:
        type: keyword
      sub_category_id:
        type: keyword