some more elasticsearch work, including index mapping and analyzer

2024-12-22 10:50:07 +00:00 · 2017-05-15 11:14:01 -07:00 · 2017-05-15 11:14:01 -07:00 · c2c547e786
parent 32b9170a81
commit c2c547e786
5 changed files with 97 additions and 24 deletions
--- a/create_es.sh
+++ b/create_es.sh
@ -0,0 +1,3 @@
 #!/usr/bin/env bash
 curl -v -XPUT 'localhost:9200/nyaav2?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml
--- a/es_mapping.yml
+++ b/es_mapping.yml
@ -0,0 +1,91 @@
 ---
 # CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
 # for inline comments.
 settings:
  analysis:
    analyzer:
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - standard
          - lowercase
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - my_ngram
    filter:
      my_ngram:
        type: edgeNGram
        min_gram: 1
        max_gram: 15
    char_filter:
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_"]
  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas : 0
    mapper:
      # disable elasticsearch's "helpful" autoschema
      dynamic: false
    # since we disabled the _all field, queries default to the
    # name of the torrent.
    query:
      default_field: display_name
 mappings:
  torrent:
    # don't want everything concatenated
    _all:
      enabled: false
    properties:
      id:
        type: long
      display_name:
        # TODO could do a fancier tokenizer here to parse out the
        # scene convention of stuff in brackets, plus stuff like k-on
        type: text
        analyzer: my_index_analyzer
        fielddata: true
      created_time:
        type: date
      # Only in the ES index for generating magnet links
      info_hash:
        enabled: false
      filesize:
        type: long
      anonymous:
        type: boolean
      trusted:
        type: boolean
      remake:
        type: boolean
      complete:
        type: boolean
      hidden:
        type: boolean
      deleted:
        type: boolean
      has_torrent:
        type: boolean
      download_count:
        type: long
      leech_count:
        type: long
      seed_count:
        type: long
      # these ids are really only for filtering, thus keyword
      uploader_id:
        type: keyword
      main_category_id:
        type: keyword
      sub_category_id:
        type: keyword
--- a/import_to_es.py
+++ b/import_to_es.py
@ -41,8 +41,6 @@ def mk_es(t):
            "id": t.id,
            "display_name": t.display_name,
            "created_time": t.created_time,
            "updated_time": t.updated_time,
            "description": t.description,
            # not analyzed but included so we can render magnet links
            # without querying sql again.
            "info_hash": t.info_hash.hex(),
@ -61,8 +59,7 @@ def mk_es(t):
            "hidden": bool(t.hidden),
            "deleted": bool(t.deleted),
            "has_torrent": t.has_torrent,
-            # XXX last_updated isn't initialized
+            # Stats
            "stats_last_updated": t.stats.last_updated or t.created_time,
            "download_count": t.stats.download_count,
            "leech_count": t.stats.leech_count,
            "seed_count": t.stats.seed_count,
--- a/nyaa/routes.py
+++ b/nyaa/routes.py
@ -148,7 +148,7 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
    s = Search(using=es_client, index='nyaav2')
    if term:
        query = db.session.query(models.TorrentNameSearch)
-        s = s.query("query_string", default_field="display_name", default_operator="AND", query=term)
+        s = s.query("simple_query_string", analyzer="my_search_analyzer", default_operator="AND", query=term)
    else:
        query = models.Torrent.query
--- a/requirements.txt
+++ b/requirements.txt
@ -33,24 +33,6 @@ webassets==0.12.1
 Werkzeug==0.12.1
 WTForms==2.1
 ## The following requirements were added by pip freeze:
 decorator==4.0.11
 elasticsearch==5.3.0
 elasticsearch-dsl==5.2.0
 ipython==6.0.0
 ipython-genutils==0.2.0
 jedi==0.10.2
 mysql-replication==0.13
 pexpect==4.2.1
 pickleshare==0.7.4
 pkg-resources==0.0.0
 progressbar2==3.20.0
 prompt-toolkit==1.0.14
 ptyprocess==0.5.1
 Pygments==2.2.0
 PyMySQL==0.7.11
 python-dateutil==2.6.0
 python-utils==2.1.0
 simplegeneric==0.8.1
 traitlets==4.3.2
 urllib3==1.21.1
 wcwidth==0.1.7
		`@ -0,0 +1,3 @@`
							`#!/usr/bin/env bash`

							`curl -v -XPUT 'localhost:9200/nyaav2?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml`