4fcef92b94
* es_mapping: update turning off dynamic mappings
  they changed it in 6.x
  https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic.html
  https://github.com/elastic/elasticsearch/pull/25734
* es_mapping: remove _all field
  deprecated in 6.0 anyway
* es_mapping.yml: fix deprecated mapping type
  https://www.elastic.co/guide/en/elasticsearch/reference/6.7/removal-of-types.html#_schedule_for_removal_of_mapping_types
  it gives a really unhelpful error otherwise, oof.
* es: fix remaining 7.xisms
  the enabled: false apparently only applies to "object" fields now, need
  index: false, and the _type got removed everywhere. Seems to work now.
* Fix weird offset error with word_delimiter_graph
  yet another es7-ism i guess
* Fix warning and some app stuff for ES 7.x

Co-authored-by: Arylide <Arylide@users.noreply.github.com>
---
# CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
# for inline comments.
settings:
  analysis:
    analyzer:
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - resolution
          - lowercase
          - word_delimit
          - my_ngram
          - trim_zero
          - unique
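      # note: filter order matters here - the resolution captures run on the
      # raw tokens before lowercasing, edge ngrams are generated near the end,
      # and "unique" deduplicates the final token stream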
      # For exact matching - separate each character for substring matching + lowercase
      exact_analyzer:
        tokenizer: exact_tokenizer
        filter:
          - lowercase
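      # e.g. (illustrative) a match_phrase query against a field using this
      # analyzer matches arbitrary substrings: "irrit" finds "[Irritating]
      # Title", since the per-character tokens line up positionally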
      # For matching full words longer than the ngram limit (15 chars)
      my_fullword_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - word_delimit
          # Skip tokens shorter than N characters,
          # since they're already indexed in the main field
          - fullword_min
          - unique
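      # e.g. (illustrative) only the first 15 characters of
      # "internationalization" reach the main ngram field, so this analyzer
      # keeps the whole word searchable through the .fullword subfield below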

    tokenizer:
      # Splits input into characters, for exact substring matching
      exact_tokenizer:
        type: pattern
        pattern: "(.)"
        group: 1
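        # e.g. "K-On!" -> ["K", "-", "O", "n", "!"], one token per character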

    filter:
      my_ngram:
        type: edge_ngram
        min_gram: 1
        max_gram: 15
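        # e.g. "subs" -> ["s", "su", "sub", "subs"]; tokens longer than
        # 15 characters are only indexed up to their 15-character prefix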
      fullword_min:
        type: length
        # Remember to change this if you change the max_gram above!
        min: 16
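        # e.g. keeps 16+ character tokens like "internationalization" and
        # drops "subs", which my_ngram already covers in the main field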
      resolution:
        type: pattern_capture
        patterns: ["(\\d+)[xX](\\d+)"]
      trim_zero:
        type: pattern_capture
        patterns: ["0*([0-9]*)"]
      word_delimit:
        type: word_delimiter_graph
        preserve_original: true
        split_on_numerics: false
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-graph-tokenfilter.html#word-delimiter-graph-tokenfilter-configure-parms
        # since we're using "trim" filters downstream, otherwise
        # you get weird lucene errors about startOffset
        adjust_offsets: false
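        # e.g. (illustrative) my_char_filter turns "k-on!" into "k_on_", which
        # the standard tokenizer keeps whole; this filter then splits it into
        # "k" and "on", while preserve_original also keeps "k_on_" itself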
    char_filter:
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_", "_=>\\u0020"]
  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas: 0
    query:
      default_field: display_name

mappings:
  # disable elasticsearch's "helpful" autoschema
  dynamic: false
  properties:
    id:
      type: long
    display_name:
      # TODO could do a fancier tokenizer here to parse out the
      # scene convention of stuff in brackets, plus stuff like k-on
      type: text
      analyzer: my_index_analyzer
      fielddata: true # Is this required?
      fields:
        # Multi-field for full-word matching (when going over ngram limits)
        # Note: will have to be queried for, not automatic
        fullword:
          type: text
          analyzer: my_fullword_index_analyzer
        # Stored for exact phrase matching
        exact:
          type: text
          analyzer: exact_analyzer
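        # e.g. (illustrative) these subfields are not searched automatically;
        # a query has to target them explicitly, as in a multi_match over
        # ["display_name", "display_name.fullword"] or a match_phrase on
        # "display_name.exact"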
    created_time:
      type: date
    # Only in the ES index for generating magnet links
    info_hash:
      type: keyword
      index: false
    filesize:
      type: long
    anonymous:
      type: boolean
    trusted:
      type: boolean
    remake:
      type: boolean
    complete:
      type: boolean
    hidden:
      type: boolean
    deleted:
      type: boolean
    has_torrent:
      type: boolean
    download_count:
      type: long
    leech_count:
      type: long
    seed_count:
      type: long
    comment_count:
      type: long
    # these ids are really only for filtering, thus keyword
    uploader_id:
      type: keyword
    main_category_id:
      type: keyword
    sub_category_id:
      type: keyword