From 4fcef92b9450b0c184fd52da760fbac92b61ff6b Mon Sep 17 00:00:00 2001
From: queue <queue@hakase.org>
Date: Sun, 12 Jul 2020 01:10:47 -0600
Subject: [PATCH] elasticsearch 7.x compatibility (#576)

* es_mapping: update turning off dynamic mappings

they changed it in 6.x

https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic.html
https://github.com/elastic/elasticsearch/pull/25734

* es_mapping: remove _all field

deprecated in 6.0 anyway

* es_mapping.yml: fix deprecated mapping type

https://www.elastic.co/guide/en/elasticsearch/reference/6.7/removal-of-types.html#_schedule_for_removal_of_mapping_types

it gives a really unhelpful error otherwise, oof.

* es: fix remaining 7.xisms

`enabled: false` apparently only applies to "object" fields
now, so other fields need `index: false` instead

and the _type got removed everywhere. Seems to work now.

* Fix weird offset error with word_delimiter_graph

yet another es7-ism i guess

* Fix warning and some app stuff for ES 7.x

Co-authored-by: Arylide <Arylide@users.noreply.github.com>
---
 create_es.sh                       |   1 +
 es_mapping.yml                     | 132 ++++++++++++++---------------
 import_to_es.py                    |   1 -
 nyaa/templates/search_results.html |   2 +-
 nyaa/views/main.py                 |   2 +-
 sync_es.py                         |   3 -
 6 files changed, 68 insertions(+), 73 deletions(-)

diff --git a/create_es.sh b/create_es.sh
index 44510f4..42aaa7a 100755
--- a/create_es.sh
+++ b/create_es.sh
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+set -e
 
 # create indices named "nyaa" and "sukebei", these are hardcoded
 curl -v -XPUT 'localhost:9200/nyaa?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml
diff --git a/es_mapping.yml b/es_mapping.yml
index 14983d5..28462f6 100644
--- a/es_mapping.yml
+++ b/es_mapping.yml
@@ -10,7 +10,6 @@ settings:
         char_filter:
           - my_char_filter
         filter:
-          - standard
           - lowercase
       my_index_analyzer:
         type: custom
@@ -52,7 +51,7 @@ settings:
 
     filter:
       my_ngram:
-        type: edgeNGram
+        type: edge_ngram
         min_gram: 1
         max_gram: 15
       fullword_min:
@@ -66,9 +65,13 @@ settings:
         type: pattern_capture
         patterns: ["0*([0-9]*)"]
       word_delimit:
-        type: word_delimiter
+        type: word_delimiter_graph
         preserve_original: true
         split_on_numerics: false
+        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-graph-tokenfilter.html#word-delimiter-graph-tokenfilter-configure-parms
+        # needed since we're using "trim" filters downstream; otherwise
+        # you get weird lucene errors about startOffset
+        adjust_offsets: false
     char_filter:
       my_char_filter:
         type: mapping
@@ -78,70 +81,65 @@ settings:
     # plus replicas don't really help either.
     number_of_shards: 1
     number_of_replicas : 0
-    mapper:
-      # disable elasticsearch's "helpful" autoschema
-      dynamic: false
-    # since we disabled the _all field, default query the
-    # name of the torrent.
     query:
       default_field: display_name
 mappings:
-  torrent:
-    # don't want everything concatenated
-    _all:
-      enabled: false
-    properties:
-      id:
-        type: long
-      display_name:
-        # TODO could do a fancier tokenizer here to parse out the
-        # the scene convention of stuff in brackets, plus stuff like k-on
-        type: text
-        analyzer: my_index_analyzer
-        fielddata: true # Is this required?
-        fields:
-          # Multi-field for full-word matching (when going over ngram limits)
-          # Note: will have to be queried for, not automatic
-          fullword:
-            type: text
-            analyzer: my_fullword_index_analyzer
-          # Stored for exact phrase matching
-          exact:
-            type: text
-            analyzer: exact_analyzer
-      created_time:
-        type: date
-        # Only in the ES index for generating magnet links
-      info_hash:
-        enabled: false
-      filesize:
-        type: long
-      anonymous:
-        type: boolean
-      trusted:
-        type: boolean
-      remake:
-        type: boolean
-      complete:
-        type: boolean
-      hidden:
-        type: boolean
-      deleted:
-        type: boolean
-      has_torrent:
-        type: boolean
-      download_count:
-        type: long
-      leech_count:
-        type: long
-      seed_count:
-        type: long
-      comment_count:
-        type: long
-      # these ids are really only for filtering, thus keyword
-      uploader_id:
-        type: keyword
-      main_category_id:
-        type: keyword
-      sub_category_id:
-        type: keyword
\ No newline at end of file
+  # disable elasticsearch's "helpful" autoschema
+  dynamic: false
+  properties:
+    id:
+      type: long
+    display_name:
+      # TODO could do a fancier tokenizer here to parse out
+      # the scene convention of stuff in brackets, plus stuff like k-on
+      type: text
+      analyzer: my_index_analyzer
+      fielddata: true # Is this required?
+      fields:
+        # Multi-field for full-word matching (when going over ngram limits)
+        # Note: will have to be queried for, not automatic
+        fullword:
+          type: text
+          analyzer: my_fullword_index_analyzer
+        # Stored for exact phrase matching
+        exact:
+          type: text
+          analyzer: exact_analyzer
+    created_time:
+      type: date
+      #
+    # Only in the ES index for generating magnet links
+    info_hash:
+      type: keyword
+      index: false
+    filesize:
+      type: long
+    anonymous:
+      type: boolean
+    trusted:
+      type: boolean
+    remake:
+      type: boolean
+    complete:
+      type: boolean
+    hidden:
+      type: boolean
+    deleted:
+      type: boolean
+    has_torrent:
+      type: boolean
+    download_count:
+      type: long
+    leech_count:
+      type: long
+    seed_count:
+      type: long
+    comment_count:
+      type: long
+    # these ids are really only for filtering, thus keyword
+    uploader_id:
+      type: keyword
+    main_category_id:
+      type: keyword
+    sub_category_id:
+      type: keyword
diff --git a/import_to_es.py b/import_to_es.py
index c244abb..6717100 100755
--- a/import_to_es.py
+++ b/import_to_es.py
@@ -34,7 +34,6 @@ def pad_bytes(in_bytes, size):
 def mk_es(t, index_name):
     return {
         "_id": t.id,
-        "_type": "torrent",
         "_index": index_name,
         "_source": {
             # we're also indexing the id as a number so you can
diff --git a/nyaa/templates/search_results.html b/nyaa/templates/search_results.html
index 25b7142..76ac131 100644
--- a/nyaa/templates/search_results.html
+++ b/nyaa/templates/search_results.html
@@ -17,7 +17,7 @@
 {% endif %}
 {% endif %}
 
-{% if (use_elastic and torrent_query.hits.total > 0) or (torrent_query.items) %}
+{% if (use_elastic and torrent_query.hits.total.value > 0) or (torrent_query.items) %}
 <div class="table-responsive">
 	<table class="table table-bordered table-hover table-striped torrent-list">
 		<thead>
diff --git a/nyaa/views/main.py b/nyaa/views/main.py
index 8dfe38f..a1dae5a 100644
--- a/nyaa/views/main.py
+++ b/nyaa/views/main.py
@@ -167,7 +167,7 @@ def home(rss):
         else:
             rss_query_string = _generate_query_string(
                 search_term, category, quality_filter, user_name)
-            max_results = min(max_search_results, query_results['hits']['total'])
+            max_results = min(max_search_results, query_results['hits']['total']['value'])
             # change p= argument to whatever you change page_parameter to or pagination breaks
             pagination = Pagination(p=query_args['page'], per_page=results_per_page,
                                     total=max_results, bs_version=3, page_parameter='p',
diff --git a/sync_es.py b/sync_es.py
index 382c744..aa1adcb 100755
--- a/sync_es.py
+++ b/sync_es.py
@@ -114,7 +114,6 @@ def reindex_torrent(t, index_name):
     return {
         '_op_type': 'update',
         '_index': index_name,
-        '_type': 'torrent',
         '_id': str(t['id']),
         "doc": doc,
         "doc_as_upsert": True
@@ -128,7 +127,6 @@ def reindex_stats(s, index_name):
     return {
         '_op_type': 'update',
         '_index': index_name,
-        '_type': 'torrent',
         '_id': str(s['torrent_id']),
         "doc": {
             "stats_last_updated": s["last_updated"],
@@ -141,7 +139,6 @@ def delet_this(row, index_name):
     return {
         "_op_type": 'delete',
         '_index': index_name,
-        '_type': 'torrent',
         '_id': str(row['values']['id'])}
 
 # we could try to make this script robust to errors from es or mysql, but since