From 4fcef92b9450b0c184fd52da760fbac92b61ff6b Mon Sep 17 00:00:00 2001 From: queue Date: Sun, 12 Jul 2020 01:10:47 -0600 Subject: [PATCH] elasticsearch 7.x compatibility (#576) * es_mapping: update turning off dynamic mappings they changed it in 6.x https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic.html https://github.com/elastic/elasticsearch/pull/25734 * es_mapping: remove _all field deprecated in 6.0 anyway * es_mapping.yml: fix deprecated mapping type https://www.elastic.co/guide/en/elasticsearch/reference/6.7/removal-of-types.html#_schedule_for_removal_of_mapping_types it gives a really unhelpful error otherwise, oof. * es: fix remaining 7.xisms the enabled: false apparently only applies to "object" fields now, need index: false and the _type got removed everywhere. Seems to work now. * Fix weird offset error with word_delimiter_graph yet another es7-ism i guess * Fix warning and some app stuff for ES 7.x Co-authored-by: Arylide --- create_es.sh | 1 + es_mapping.yml | 132 ++++++++++++++--------------- import_to_es.py | 1 - nyaa/templates/search_results.html | 2 +- nyaa/views/main.py | 2 +- sync_es.py | 3 - 6 files changed, 68 insertions(+), 73 deletions(-) diff --git a/create_es.sh b/create_es.sh index 44510f4..42aaa7a 100755 --- a/create_es.sh +++ b/create_es.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -e # create indices named "nyaa" and "sukebei", these are hardcoded curl -v -XPUT 'localhost:9200/nyaa?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml diff --git a/es_mapping.yml b/es_mapping.yml index 14983d5..28462f6 100644 --- a/es_mapping.yml +++ b/es_mapping.yml @@ -10,7 +10,6 @@ settings: char_filter: - my_char_filter filter: - - standard - lowercase my_index_analyzer: type: custom @@ -52,7 +51,7 @@ settings: filter: my_ngram: - type: edgeNGram + type: edge_ngram min_gram: 1 max_gram: 15 fullword_min: @@ -66,9 +65,13 @@ settings: type: pattern_capture patterns: ["0*([0-9]*)"] word_delimit: - type: 
word_delimiter + type: word_delimiter_graph preserve_original: true split_on_numerics: false + # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-graph-tokenfilter.html#word-delimiter-graph-tokenfilter-configure-parms + # since we're using "trim" filters downstream, otherwise + # you get weird lucene errors about startOffset + adjust_offsets: false char_filter: my_char_filter: type: mapping @@ -78,70 +81,65 @@ settings: # plus replicas don't really help either. number_of_shards: 1 number_of_replicas : 0 - mapper: - # disable elasticsearch's "helpful" autoschema - dynamic: false - # since we disabled the _all field, default query the - # name of the torrent. query: default_field: display_name mappings: - torrent: - # don't want everything concatenated - _all: - enabled: false - properties: - id: - type: long - display_name: - # TODO could do a fancier tokenizer here to parse out the - # the scene convention of stuff in brackets, plus stuff like k-on - type: text - analyzer: my_index_analyzer - fielddata: true # Is this required? 
- fields: - # Multi-field for full-word matching (when going over ngram limits) - # Note: will have to be queried for, not automatic - fullword: - type: text - analyzer: my_fullword_index_analyzer - # Stored for exact phrase matching - exact: - type: text - analyzer: exact_analyzer - created_time: - type: date - # Only in the ES index for generating magnet links - info_hash: - enabled: false - filesize: - type: long - anonymous: - type: boolean - trusted: - type: boolean - remake: - type: boolean - complete: - type: boolean - hidden: - type: boolean - deleted: - type: boolean - has_torrent: - type: boolean - download_count: - type: long - leech_count: - type: long - seed_count: - type: long - comment_count: - type: long - # these ids are really only for filtering, thus keyword - uploader_id: - type: keyword - main_category_id: - type: keyword - sub_category_id: - type: keyword \ No newline at end of file + # disable elasticsearch's "helpful" autoschema + dynamic: false + properties: + id: + type: long + display_name: + # TODO could do a fancier tokenizer here to parse out the + # the scene convention of stuff in brackets, plus stuff like k-on + type: text + analyzer: my_index_analyzer + fielddata: true # Is this required? 
+ fields: + # Multi-field for full-word matching (when going over ngram limits) + # Note: will have to be queried for, not automatic + fullword: + type: text + analyzer: my_fullword_index_analyzer + # Stored for exact phrase matching + exact: + type: text + analyzer: exact_analyzer + created_time: + type: date + # + # Only in the ES index for generating magnet links + info_hash: + type: keyword + index: false + filesize: + type: long + anonymous: + type: boolean + trusted: + type: boolean + remake: + type: boolean + complete: + type: boolean + hidden: + type: boolean + deleted: + type: boolean + has_torrent: + type: boolean + download_count: + type: long + leech_count: + type: long + seed_count: + type: long + comment_count: + type: long + # these ids are really only for filtering, thus keyword + uploader_id: + type: keyword + main_category_id: + type: keyword + sub_category_id: + type: keyword diff --git a/import_to_es.py b/import_to_es.py index c244abb..6717100 100755 --- a/import_to_es.py +++ b/import_to_es.py @@ -34,7 +34,6 @@ def pad_bytes(in_bytes, size): def mk_es(t, index_name): return { "_id": t.id, - "_type": "torrent", "_index": index_name, "_source": { # we're also indexing the id as a number so you can diff --git a/nyaa/templates/search_results.html b/nyaa/templates/search_results.html index 25b7142..76ac131 100644 --- a/nyaa/templates/search_results.html +++ b/nyaa/templates/search_results.html @@ -17,7 +17,7 @@ {% endif %} {% endif %} -{% if (use_elastic and torrent_query.hits.total > 0) or (torrent_query.items) %} +{% if (use_elastic and torrent_query.hits.total.value > 0) or (torrent_query.items) %}
diff --git a/nyaa/views/main.py b/nyaa/views/main.py index 8dfe38f..a1dae5a 100644 --- a/nyaa/views/main.py +++ b/nyaa/views/main.py @@ -167,7 +167,7 @@ def home(rss): else: rss_query_string = _generate_query_string( search_term, category, quality_filter, user_name) - max_results = min(max_search_results, query_results['hits']['total']) + max_results = min(max_search_results, query_results['hits']['total']['value']) # change p= argument to whatever you change page_parameter to or pagination breaks pagination = Pagination(p=query_args['page'], per_page=results_per_page, total=max_results, bs_version=3, page_parameter='p', diff --git a/sync_es.py b/sync_es.py index 382c744..aa1adcb 100755 --- a/sync_es.py +++ b/sync_es.py @@ -114,7 +114,6 @@ def reindex_torrent(t, index_name): return { '_op_type': 'update', '_index': index_name, - '_type': 'torrent', '_id': str(t['id']), "doc": doc, "doc_as_upsert": True @@ -128,7 +127,6 @@ def reindex_stats(s, index_name): return { '_op_type': 'update', '_index': index_name, - '_type': 'torrent', '_id': str(s['torrent_id']), "doc": { "stats_last_updated": s["last_updated"], @@ -141,7 +139,6 @@ def delet_this(row, index_name): return { "_op_type": 'delete', '_index': index_name, - '_type': 'torrent', '_id': str(row['values']['id'])} # we could try to make this script robust to errors from es or mysql, but since