From c2c547e7867742b06cd97c95210c864b0ed9789f Mon Sep 17 00:00:00 2001 From: aldacron Date: Mon, 15 May 2017 11:14:01 -0700 Subject: [PATCH] some more elasticsearch work, including index mapping and analyzer --- create_es.sh | 3 ++ es_mapping.yml | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ import_to_es.py | 5 +-- nyaa/routes.py | 2 +- requirements.txt | 20 +---------- 5 files changed, 97 insertions(+), 24 deletions(-) create mode 100755 create_es.sh create mode 100644 es_mapping.yml diff --git a/create_es.sh b/create_es.sh new file mode 100755 index 0000000..2b83620 --- /dev/null +++ b/create_es.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +curl -v -XPUT 'localhost:9200/nyaav2?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml diff --git a/es_mapping.yml b/es_mapping.yml new file mode 100644 index 0000000..9085ec2 --- /dev/null +++ b/es_mapping.yml @@ -0,0 +1,91 @@ +--- +# CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml +# for inline comments. +settings: + analysis: + analyzer: + my_search_analyzer: + type: custom + tokenizer: standard + char_filter: + - my_char_filter + filter: + - standard + - lowercase + my_index_analyzer: + type: custom + tokenizer: standard + char_filter: + - my_char_filter + filter: + - lowercase + - my_ngram + filter: + my_ngram: + type: edgeNGram + min_gram: 1 + max_gram: 15 + char_filter: + my_char_filter: + type: mapping + mappings: ["-=>_", "!=>_"] + index: + # we're running a single es node, so no sharding necessary, + # plus replicas don't really help either. + number_of_shards: 1 + number_of_replicas : 0 + mapper: + # disable elasticsearch's "helpful" autoschema + dynamic: false + # since we disabled the _all field, default query the + # name of the torrent. 
+ query: + default_field: display_name +mappings: + torrent: + # don't want everything concatenated + _all: + enabled: false + properties: + id: + type: long + display_name: + # TODO could do a fancier tokenizer here to parse out the + # scene convention of stuff in brackets, plus stuff like k-on + type: text + analyzer: my_index_analyzer + fielddata: true + created_time: + type: date + # Only in the ES index for generating magnet links + info_hash: + enabled: false + filesize: + type: long + anonymous: + type: boolean + trusted: + type: boolean + remake: + type: boolean + complete: + type: boolean + hidden: + type: boolean + deleted: + type: boolean + has_torrent: + type: boolean + download_count: + type: long + leech_count: + type: long + seed_count: + type: long + # these ids are really only for filtering, thus keyword + uploader_id: + type: keyword + main_category_id: + type: keyword + sub_category_id: + type: keyword \ No newline at end of file diff --git a/import_to_es.py b/import_to_es.py index e714da5..046bde6 100644 --- a/import_to_es.py +++ b/import_to_es.py @@ -41,8 +41,6 @@ def mk_es(t): "id": t.id, "display_name": t.display_name, "created_time": t.created_time, - "updated_time": t.updated_time, - "description": t.description, # not analyzed but included so we can render magnet links # without querying sql again. 
"info_hash": t.info_hash.hex(), @@ -61,8 +59,7 @@ def mk_es(t): "hidden": bool(t.hidden), "deleted": bool(t.deleted), "has_torrent": t.has_torrent, - # XXX last_updated isn't initialized - "stats_last_updated": t.stats.last_updated or t.created_time, + # Stats "download_count": t.stats.download_count, "leech_count": t.stats.leech_count, "seed_count": t.stats.seed_count, diff --git a/nyaa/routes.py b/nyaa/routes.py index 3e87a2a..758635a 100644 --- a/nyaa/routes.py +++ b/nyaa/routes.py @@ -148,7 +148,7 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_ s = Search(using=es_client, index='nyaav2') if term: query = db.session.query(models.TorrentNameSearch) - s = s.query("query_string", default_field="display_name", default_operator="AND", query=term) + s = s.query("simple_query_string", analyzer="my_search_analyzer", default_operator="AND", query=term) else: query = models.Torrent.query diff --git a/requirements.txt b/requirements.txt index dbf234d..af89eab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,24 +33,6 @@ webassets==0.12.1 Werkzeug==0.12.1 WTForms==2.1 ## The following requirements were added by pip freeze: -decorator==4.0.11 elasticsearch==5.3.0 elasticsearch-dsl==5.2.0 -ipython==6.0.0 -ipython-genutils==0.2.0 -jedi==0.10.2 -mysql-replication==0.13 -pexpect==4.2.1 -pickleshare==0.7.4 -pkg-resources==0.0.0 -progressbar2==3.20.0 -prompt-toolkit==1.0.14 -ptyprocess==0.5.1 -Pygments==2.2.0 -PyMySQL==0.7.11 -python-dateutil==2.6.0 -python-utils==2.1.0 -simplegeneric==0.8.1 -traitlets==4.3.2 -urllib3==1.21.1 -wcwidth==0.1.7 +progressbar2==3.20.0 \ No newline at end of file