elasticsearch 7.x compatibility (#576)

* es_mapping: update turning off dynamic mappings

  they changed it in 6.x
  https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic.html
  https://github.com/elastic/elasticsearch/pull/25734

* es_mapping: remove _all field

  deprecated in 6.0 anyway

* es_mapping.yml: fix deprecated mapping type

  https://www.elastic.co/guide/en/elasticsearch/reference/6.7/removal-of-types.html#_schedule_for_removal_of_mapping_types
  it gives a really unhelpful error otherwise, oof.

* es: fix remaining 7.xisms

  the enabled: false apparently only applies to "object" fields now,
  need index: false

  and the _type got removed everywhere. Seems to work now.

* Fix weird offset error with word_delimiter_graph

  yet another es7-ism i guess

* Fix warning and some app stuff for ES 7.x

Co-authored-by: Arylide <Arylide@users.noreply.github.com>
2025-04-23 03:52:56 +00:00 · 2020-07-12 01:10:47 -06:00 · 2020-07-12 01:10:47 -06:00 · 4fcef92b94
parent 72087ddaaf
commit 4fcef92b94
6 changed files with 68 additions and 73 deletions
--- a/create_es.sh
+++ b/create_es.sh
@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+set -e

 # create indices named "nyaa" and "sukebei", these are hardcoded
 curl -v -XPUT 'localhost:9200/nyaa?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml
--- a/es_mapping.yml
+++ b/es_mapping.yml
@ -10,7 +10,6 @@ settings:
        char_filter:
          - my_char_filter
        filter:
-          - standard
          - lowercase
      my_index_analyzer:
        type: custom
@ -52,7 +51,7 @@ settings:

    filter:
      my_ngram:
-        type: edgeNGram
+        type: edge_ngram
        min_gram: 1
        max_gram: 15
      fullword_min:
@ -66,9 +65,13 @@ settings:
        type: pattern_capture
        patterns: ["0*([0-9]*)"]
      word_delimit:
-        type: word_delimiter
+        type: word_delimiter_graph
        preserve_original: true
        split_on_numerics: false
+        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-graph-tokenfilter.html#word-delimiter-graph-tokenfilter-configure-parms
+        # adjust_offsets must be disabled since we're using "trim" filters
+        # downstream; otherwise you get weird lucene errors about startOffset
+        adjust_offsets: false
    char_filter:
      my_char_filter:
        type: mapping
@ -78,70 +81,65 @@ settings:
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas : 0
-    mapper:
-      # disable elasticsearch's "helpful" autoschema
-      dynamic: false
-    # since we disabled the _all field, default query the
-    # name of the torrent.
    query:
      default_field: display_name
 mappings:
-  torrent:
-    # don't want everything concatenated
-    _all:
-      enabled: false
-    properties:
-      id:
-        type: long
-      display_name:
-        # TODO could do a fancier tokenizer here to parse out the
-        # the scene convention of stuff in brackets, plus stuff like k-on
-        type: text
-        analyzer: my_index_analyzer
-        fielddata: true # Is this required?
-        fields:
-          # Multi-field for full-word matching (when going over ngram limits)
-          # Note: will have to be queried for, not automatic
-          fullword:
-            type: text
-            analyzer: my_fullword_index_analyzer
-          # Stored for exact phrase matching
-          exact:
-            type: text
-            analyzer: exact_analyzer
-      created_time:
-        type: date
-        # Only in the ES index for generating magnet links
-      info_hash:
-        enabled: false
-      filesize:
-        type: long
-      anonymous:
-        type: boolean
-      trusted:
-        type: boolean
-      remake:
-        type: boolean
-      complete:
-        type: boolean
-      hidden:
-        type: boolean
-      deleted:
-        type: boolean
-      has_torrent:
-        type: boolean
-      download_count:
-        type: long
-      leech_count:
-        type: long
-      seed_count:
-        type: long
-      comment_count:
-        type: long
-      # these ids are really only for filtering, thus keyword
-      uploader_id:
-        type: keyword
-      main_category_id:
-        type: keyword
-      sub_category_id:
-        type: keyword
+  # disable elasticsearch's "helpful" autoschema
+  dynamic: false
+  properties:
+    id:
+      type: long
+    display_name:
+      # TODO could do a fancier tokenizer here to parse out
+      # the scene convention of stuff in brackets, plus stuff like k-on
+      type: text
+      analyzer: my_index_analyzer
+      fielddata: true # Is this required?
+      fields:
+        # Multi-field for full-word matching (when going over ngram limits)
+        # Note: will have to be queried for, not automatic
+        fullword:
+          type: text
+          analyzer: my_fullword_index_analyzer
+        # Stored for exact phrase matching
+        exact:
+          type: text
+          analyzer: exact_analyzer
+    created_time:
+      type: date
+      #
+    # Only in the ES index for generating magnet links
+    info_hash:
+      type: keyword
+      index: false
+    filesize:
+      type: long
+    anonymous:
+      type: boolean
+    trusted:
+      type: boolean
+    remake:
+      type: boolean
+    complete:
+      type: boolean
+    hidden:
+      type: boolean
+    deleted:
+      type: boolean
+    has_torrent:
+      type: boolean
+    download_count:
+      type: long
+    leech_count:
+      type: long
+    seed_count:
+      type: long
+    comment_count:
+      type: long
+    # these ids are really only for filtering, thus keyword
+    uploader_id:
+      type: keyword
+    main_category_id:
+      type: keyword
+    sub_category_id:
+      type: keyword
--- a/import_to_es.py
+++ b/import_to_es.py
@ -34,7 +34,6 @@ def pad_bytes(in_bytes, size):
 def mk_es(t, index_name):
    return {
        "_id": t.id,
-        "_type": "torrent",
        "_index": index_name,
        "_source": {
            # we're also indexing the id as a number so you can
--- a/nyaa/templates/search_results.html
+++ b/nyaa/templates/search_results.html
@ -17,7 +17,7 @@
 {% endif %}
 {% endif %}

-{% if (use_elastic and torrent_query.hits.total > 0) or (torrent_query.items) %}
+{% if (use_elastic and torrent_query.hits.total.value > 0) or (torrent_query.items) %}
 <div class="table-responsive">
 	<table class="table table-bordered table-hover table-striped torrent-list">
 		<thead>
--- a/nyaa/views/main.py
+++ b/nyaa/views/main.py
@ -167,7 +167,7 @@ def home(rss):
        else:
            rss_query_string = _generate_query_string(
                search_term, category, quality_filter, user_name)
-            max_results = min(max_search_results, query_results['hits']['total'])
+            max_results = min(max_search_results, query_results['hits']['total']['value'])
            # change p= argument to whatever you change page_parameter to or pagination breaks
            pagination = Pagination(p=query_args['page'], per_page=results_per_page,
                                    total=max_results, bs_version=3, page_parameter='p',
--- a/sync_es.py
+++ b/sync_es.py
@ -114,7 +114,6 @@ def reindex_torrent(t, index_name):
    return {
        '_op_type': 'update',
        '_index': index_name,
-        '_type': 'torrent',
        '_id': str(t['id']),
        "doc": doc,
        "doc_as_upsert": True
@ -128,7 +127,6 @@ def reindex_stats(s, index_name):
    return {
        '_op_type': 'update',
        '_index': index_name,
-        '_type': 'torrent',
        '_id': str(s['torrent_id']),
        "doc": {
            "stats_last_updated": s["last_updated"],
@ -141,7 +139,6 @@ def delet_this(row, index_name):
    return {
        "_op_type": 'delete',
        '_index': index_name,
-        '_type': 'torrent',
        '_id': str(row['values']['id'])}

 # we could try to make this script robust to errors from es or mysql, but since