merged elasticsearch, let's hope this doesn't break shit

2024-12-22 05:29:59 +00:00 · 2017-05-16 01:28:55 -07:00 · 2017-05-16 01:28:55 -07:00 · 00c768c722
parent b7a2aa78a4 1d0177480e
commit 00c768c722
14 changed files with 969 additions and 177 deletions
--- a/README.md
+++ b/README.md
@ -44,5 +44,43 @@
 - Start the dev server with `python run.py`
 - Deactivate `source deactivate`
 # Enabling ElasticSearch
 ## Basics
 - Install jdk `sudo apt-get install openjdk-8-jdk`
 - Install elasticsearch https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
 - `sudo systemctl enable elasticsearch.service`
 - `sudo systemctl start elasticsearch.service`
 - Run `curl -XGET 'localhost:9200'` and make sure ES is running
 - Optional: install Kibana as a search frontend for ES
 ## Enable MySQL Binlogging
 - Add the `[mariadb]` bin-log section to my.cnf and reload mysql server
 - Connect to mysql
 - `SHOW VARIABLES LIKE 'binlog_format';`
    - Make sure it shows ROW
 - Connect to root user
 - `GRANT REPLICATION SLAVE ON *.* TO 'test'@'localhost';` where test is the user you will be running `sync_es.py` with
 ## Setting up ES
 - Run `./create_es.sh` and this creates two indices: `nyaa` and `sukebei`
 - The output should show `acknowledged: true` twice
 - The safest bet is to disable the webapp here to ensure there's no database writes
 - Run `python import_to_es.py` with `SITE_FLAVOR` set to `nyaa`
 - Run `python import_to_es.py` with `SITE_FLAVOR` set to `sukebei`
 - These will take some time to run as it's indexing
 ## Setting up sync_es.py
 - Sync_es.py keeps the ElasticSearch index updated by reading the BinLog
 - Configure the MySQL options with the user where you granted the REPLICATION permissions
 - Connect to MySQL, run `SHOW MASTER STATUS;`.
 - Copy the output to `/var/lib/sync_es_position.json` with the contents `{"log_file": "FILENAME", "log_pos": POSITION}`, replacing FILENAME with the File value (something like master1-bin.000002) and POSITION with the Position value (something like 892528513) from the SQL output
 - Set up `sync_es.py` as a service and run it, preferably as the system/root
 - Make sure `sync_es.py` runs within venv with the right dependencies
 ## Good to go!
 - After that, enable the `USE_ELASTIC_SEARCH` flag and restart the webapp and you're good to go
 ## Code Quality:
 - Remember to follow PEP8 style guidelines and run `./lint.sh` before committing.
--- a/config.example.py
+++ b/config.example.py
@ -33,8 +33,6 @@ MAIL_FROM_ADDRESS = '***'
 SMTP_USERNAME = '***'
 SMTP_PASSWORD = '***'
 RESULTS_PER_PAGE = 75
 # What the site identifies itself as.
 SITE_NAME = 'Nyaa'
@ -49,3 +47,14 @@ ENFORCE_MAIN_ANNOUNCE_URL = False
 MAIN_ANNOUNCE_URL = ''
 BACKUP_TORRENT_FOLDER = 'torrents'
 #
 # Search Options
 #
 # Number of results per page (also the number of items returned for RSS)
 RESULTS_PER_PAGE = 75
 # When True, requests that carry a search term are served from ElasticSearch
 USE_ELASTIC_SEARCH = False
 # When True, ElasticSearch results request highlighting on display_name
 ENABLE_ELASTIC_SEARCH_HIGHLIGHT = False
 # Max ES search results, do not set over 10000
 ES_MAX_SEARCH_RESULT = 1000
 ES_INDEX_NAME = SITE_FLAVOR  # we create indices named nyaa or sukebei
--- a/create_es.sh
+++ b/create_es.sh
@ -0,0 +1,5 @@
 #!/usr/bin/env bash
 # create indices named "nyaa" and "sukebei", these are hardcoded
 # both indices are created from the same settings/mapping file, es_mapping.yml
 curl -v -XPUT 'localhost:9200/nyaa?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml
 curl -v -XPUT 'localhost:9200/sukebei?pretty' -H"Content-Type: application/yaml" --data-binary @es_mapping.yml
--- a/es_mapping.yml
+++ b/es_mapping.yml
@ -0,0 +1,91 @@
 ---
 # CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
 # for inline comments.
 settings:
  analysis:
    analyzer:
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - standard
          - lowercase
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - my_ngram
    filter:
      my_ngram:
        type: edgeNGram
        min_gram: 1
        max_gram: 15
    char_filter:
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_"]
  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas : 0
    mapper:
      # disable elasticsearch's "helpful" autoschema
      dynamic: false
    # since we disabled the _all field, default query the
    # name of the torrent.
    query:
      default_field: display_name
 mappings:
  torrent:
    # don't want everything concatenated
    _all:
      enabled: false
    properties:
      id:
        type: long
      display_name:
        # TODO could do a fancier tokenizer here to parse out the
        # the scene convention of stuff in brackets, plus stuff like k-on
        type: text
        analyzer: my_index_analyzer
        fielddata: true
      created_time:
        type: date
        # Only in the ES index for generating magnet links
      info_hash:
        enabled: false
      filesize:
        type: long
      anonymous:
        type: boolean
      trusted:
        type: boolean
      remake:
        type: boolean
      complete:
        type: boolean
      hidden:
        type: boolean
      deleted:
        type: boolean
      has_torrent:
        type: boolean
      download_count:
        type: long
      leech_count:
        type: long
      seed_count:
        type: long
      # these ids are really only for filtering, thus keyword
      uploader_id:
        type: keyword
      main_category_id:
        type: keyword
      sub_category_id:
        type: keyword
--- a/import_to_es.py
+++ b/import_to_es.py
@ -0,0 +1,92 @@
 #!/usr/bin/env python
 """
 Bulk load torrents from mysql into the elasticsearch index named by
 the `ES_INDEX_NAME` config value, which is assumed to already exist.
 This is a one-shot deal, so you'd either need to complement it
 with a cron job or some binlog-reading thing (TODO)
 """
 from nyaa import app
 from nyaa.models import Torrent
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 import progressbar
 import sys
 # Progress bar sized to the number of torrents counted at startup;
 # page_query() below advances it between batches.
 bar = progressbar.ProgressBar(
        max_value=Torrent.query.count(),
        widgets=[
            progressbar.SimpleProgress(),
            ' [', progressbar.Timer(), '] ',
            progressbar.Bar(),
            ' (', progressbar.ETA(), ') ',
            ])
 # Client constructed with no arguments, i.e. the library's default connection.
 es = Elasticsearch()
 # turn into thing that elasticsearch indexes. We flatten in
 # the stats (seeders/leechers) so we can order by them in es naturally.
 # we _don't_ dereference uploader_id to the user's display name however,
 # instead doing that at query time. I _think_ this is right because
 # we don't want to reindex all the user's torrents just because they
 # changed their name, and we don't really want to FTS search on the user anyway.
 # Maybe it's more convenient to dereference though.
 def mk_es(t):
    """Build the bulk-helper action dict that indexes torrent ``t``.

    The document id is the torrent id, the type is ``torrent`` and the
    target index comes from the ``ES_INDEX_NAME`` config value.
    """
    stats = t.stats
    source = {
        # we're also indexing the id as a number so you can
        # order by it. seems like this is just equivalent to
        # order by created_time, but oh well
        "id": t.id,
        "display_name": t.display_name,
        "created_time": t.created_time,
        # not analyzed but included so we can render magnet links
        # without querying sql again.
        "info_hash": t.info_hash.hex(),
        "filesize": t.filesize,
        "uploader_id": t.uploader_id,
        "main_category_id": t.main_category_id,
        "sub_category_id": t.sub_category_id,
        # XXX all the bitflags are numbers
        "anonymous": bool(t.anonymous),
        "trusted": bool(t.trusted),
        "remake": bool(t.remake),
        "complete": bool(t.complete),
        # TODO instead of indexing and filtering later
        # could delete from es entirely. Probably won't matter
        # for at least a few months.
        "hidden": bool(t.hidden),
        "deleted": bool(t.deleted),
        "has_torrent": t.has_torrent,
        # Stats, flattened in so es can sort on them directly
        "download_count": stats.download_count,
        "leech_count": stats.leech_count,
        "seed_count": stats.seed_count,
    }
    action = {
        "_id": t.id,
        "_type": "torrent",
        "_index": app.config['ES_INDEX_NAME'],
        "_source": source,
    }
    return action
 # page through an sqlalchemy query, like the per_fetch but
 # doesn't break the eager joins its doing against the stats table.
 # annoying that this isn't built in somehow.
 def page_query(query, limit=sys.maxsize, batch_size=10000):
    """Yield the rows of ``query`` by slicing it in windows of ``batch_size``.

    Iteration ends when ``limit`` rows have been requested or a window
    comes back empty. The module-level progress bar is updated with the
    window's start offset after each window that is followed by another.
    """
    offset = 0
    while True:
        upper = min(limit, offset + batch_size)
        # Zero-width window: nothing left to request.
        if upper == offset:
            return
        window = query.slice(offset, upper)
        if not window:
            return
        yielded_any = False
        for row in window:
            yielded_any = True
            yield row
        if upper == limit or not yielded_any:
            return
        bar.update(offset)
        offset = upper
 # Stream every torrent through mk_es() and send the actions to
 # elasticsearch via the bulk helper, 10000 actions per chunk.
 helpers.bulk(es, (mk_es(t) for t in page_query(Torrent.query)), chunk_size=10000)
--- a/my.cnf
+++ b/my.cnf
@ -4,3 +4,9 @@ ft_min_word_len=2
 innodb_ft_cache_size = 80000000
 innodb_ft_total_cache_size = 1600000000
 max_allowed_packet = 100M
 [mariadb]
 log-bin
 server_id=1
 log-basename=master1
 binlog-format = row
--- a/nyaa/routes.py
+++ b/nyaa/routes.py
@ -6,18 +6,16 @@ from nyaa import bencode, utils
 from nyaa import torrents
 from nyaa import backend
 from nyaa import api_handler
 from nyaa.search import search_elastic, search_db
 import config
 import json
 import re
 from datetime import datetime, timedelta
 import ipaddress
 import os.path
 import base64
 from urllib.parse import quote
-import sqlalchemy_fulltext.modes as FullTextMode
+import math
 from sqlalchemy_fulltext import FullTextSearch
 import shlex
 from werkzeug import url_encode
 from itsdangerous import URLSafeSerializer, BadSignature
@ -27,7 +25,14 @@ from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.utils import formatdate
 from flask_paginate import Pagination
 DEBUG_API = False
 DEFAULT_MAX_SEARCH_RESULT = 1000
 DEFAULT_PER_PAGE = 75
 SERACH_PAGINATE_DISPLAY_MSG = '''Displaying results {start}-{end} out of {total} results.<br>
                                 Please refine your search results if you can't find what you were looking for.'''
 def redirect_url():
@ -48,144 +53,13 @@ def modify_query(**new_values):
    return '{}?{}'.format(flask.request.path, url_encode(args))
@app.template_global()
 def filter_truthy(input_list):
    ''' Return the truthy items of input_list, in order.
        Exists because Jinja2 has no list comprehensions; used by
        the search_results.html template '''
    return list(filter(None, input_list))
 def search(term='', user=None, sort='id', order='desc', category='0_0',
           quality_filter='0', page=1, rss=False, admin=False):
    """Build the SQL torrent listing/search query.

    Args:
        term: full-text search string; words shorter than 2 chars are skipped.
        user: id of the uploader to restrict to (aborts 404 if unknown).
        sort: one of id/size/name/seeders/leechers/downloads (case-insensitive).
        order: 'asc' or 'desc' (case-insensitive).
        category: '<main>_<sub>' ids; '0_0' means no category filter.
        quality_filter: '0' all, '1' no remakes, '2' trusted, '3' complete.
        page: page number handed to ``paginate_faste``.
        rss: force id/desc ordering and return a limited query.
        admin: skip the deleted/hidden visibility filters.

    Returns:
        A query limited to RESULTS_PER_PAGE rows when ``rss`` is true,
        otherwise the result of ``paginate_faste``.

    Aborts with 400 on an invalid sort, order, quality filter or category.
    """
    sort_keys = {
        'id': models.Torrent.id,
        'size': models.Torrent.filesize,
        'name': models.Torrent.display_name,
        'seeders': models.Statistic.seed_count,
        'leechers': models.Statistic.leech_count,
        'downloads': models.Statistic.download_count
    }

    # Look the column up with the same normalized key that was validated,
    # otherwise e.g. sort='ID' passes the check and then raises KeyError.
    sort_ = sort.lower()
    if sort_ not in sort_keys:
        flask.abort(400)
    sort = sort_keys[sort_]

    order_ = order.lower()
    if order_ not in ('desc', 'asc'):
        flask.abort(400)
    # Use the normalized value below; column objects only have asc()/desc().
    order = order_

    filter_keys = {
        '0': None,
        '1': (models.TorrentFlags.REMAKE, False),
        '2': (models.TorrentFlags.TRUSTED, True),
        '3': (models.TorrentFlags.COMPLETE, True)
    }

    # None is a valid value ('0' = no filter), so a sentinel marks "missing".
    sentinel = object()
    filter_tuple = filter_keys.get(quality_filter.lower(), sentinel)
    if filter_tuple is sentinel:
        flask.abort(400)

    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            # Reject categories that do not exist instead of silently
            # dropping the category filter.
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # Force sort by id desc if rss
    if rss:
        sort = sort_keys['id']
        order = 'desc'

    same_user = False
    if flask.g.user:
        same_user = flask.g.user.id == user

    if term:
        query = db.session.query(models.TorrentNameSearch)
    else:
        query = models.Torrent.query

    # User view (/user/username)
    if user:
        query = query.filter(models.Torrent.uploader_id == user)

        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in user is not the same as the user being viewed, show only torrents that aren't hidden or anonymous
            # If logged in user is the same as the user being viewed, show all torrents including hidden and anonymous ones
            # On RSS pages in user view, show only torrents that aren't hidden or anonymous no matter what
            if not same_user or rss:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN |
                                                                      models.TorrentFlags.ANONYMOUS)).is_(False))
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if flask.g.user and not rss:
                query = query.filter((models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False)) |
                                     (models.Torrent.uploader_id == flask.g.user.id))
            # Otherwise, show all torrents that aren't hidden
            else:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False))

    if main_category:
        query = query.filter(models.Torrent.main_category_id == main_cat_id)
    elif sub_category:
        query = query.filter((models.Torrent.main_category_id == main_cat_id) &
                             (models.Torrent.sub_category_id == sub_cat_id))

    if filter_tuple:
        query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))

    if term:
        for item in shlex.split(term, posix=False):
            if len(item) >= 2:
                query = query.filter(FullTextSearch(
                    item, models.TorrentNameSearch, FullTextMode.NATURAL))

    # Sort and order; join the owning table when sorting on a non-Torrent column
    if sort.class_ != models.Torrent:
        query = query.join(sort.class_)

    query = query.order_by(getattr(sort, order)())

    if rss:
        query = query.limit(app.config['RESULTS_PER_PAGE'])
    else:
        query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)

    return query
@app.errorhandler(404)
 def not_found(error):
@ -203,7 +77,6 @@ def before_request():
        flask.g.user = user
        if not 'timeout' in flask.session or flask.session['timeout'] < datetime.now():
            print("hio")
            flask.session['timeout'] = datetime.now() + timedelta(days=7)
            flask.session.permanent = True
            flask.session.modified = True
@ -225,6 +98,18 @@ def _generate_query_string(term, category, filter, user):
    return params
@app.template_filter('utc_time')
 def get_utc_timestamp(datetime_str):
    ''' Returns a UTC POSIX timestamp, as seconds.

        datetime_str must look like 'YYYY-MM-DDTHH:MM:SS' and is treated
        as a UTC wall-clock time; strptime raises ValueError otherwise. '''
    # Naive epoch built directly; datetime.utcfromtimestamp() is deprecated.
    utc_epoch = datetime(1970, 1, 1)
    parsed = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S')
    return int((parsed - utc_epoch).total_seconds())
@app.template_filter('display_time')
 def get_display_time(datetime_str):
    ''' Reformat a 'YYYY-MM-DDTHH:MM:SS' string as 'YYYY-MM-DD HH:MM' '''
    parsed = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S')
    return parsed.strftime('%Y-%m-%d %H:%M')
@app.route('/rss', defaults={'rss': True})
@app.route('/', defaults={'rss': False})
 def home(rss):
@ -241,6 +126,10 @@ def home(rss):
    if page:
        page = int(page)
    per_page = app.config.get('RESULTS_PER_PAGE')
    if not per_page:
        per_page = DEFAULT_PER_PAGE
    user_id = None
    if user_name:
        user = models.User.by_username(user_name)
@ -249,30 +138,72 @@ def home(rss):
        user_id = user.id
    query_args = {
        'term': term or '',
        'user': user_id,
        'sort': sort or 'id',
        'order': order or 'desc',
        'category': category or '0_0',
        'quality_filter': quality_filter or '0',
        'page': page or 1,
-        'rss': rss
+        'rss': rss,
        'per_page': per_page
    }
-    # God mode
+    if flask.g.user:
-    if flask.g.user and flask.g.user.is_admin:
+        query_args['logged_in_user'] = flask.g.user
-        query_args['admin'] = True
+        if flask.g.user.is_admin:  # God mode
            query_args['admin'] = True
-    query = search(**query_args)
+    # If searching, we get results from elastic search
    use_elastic = app.config.get('USE_ELASTIC_SEARCH')
    if use_elastic and term:
        query_args['term'] = term
-    if rss:
+        max_search_results = app.config.get('ES_MAX_SEARCH_RESULT')
-        return render_rss('/', query)
+        if not max_search_results:
            max_search_results = DEFAULT_MAX_SEARCH_RESULT
        max_page = min(query_args['page'], int(math.ceil(max_search_results / float(per_page)))) # Only allow up to (max_search_results / page) pages 
        query_args['page'] = max_page
        query_args['max_search_results'] = max_search_results
        query_results = search_elastic(**query_args)
        if rss:
            return render_rss('/', query_results, use_elastic=True)
        else:
            rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
            max_results = min(max_search_results, query_results['hits']['total'])
            # change p= argument to whatever you change page_parameter to or pagination breaks
            pagination = Pagination(p=query_args['page'], per_page=per_page,
                                    total=max_results, bs_version=3, page_parameter='p',
                                    display_msg=SERACH_PAGINATE_DISPLAY_MSG)
            return flask.render_template('home.html',
                                         use_elastic=True,
                                         pagination=pagination,
                                         torrent_query=query_results,
                                         search=query_args,
                                         rss_filter=rss_query_string)
    else:
-        rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
+        # If ES is enabled, default to db search for browsing
-        return flask.render_template('home.html',
+        if use_elastic:
-                                     torrent_query=query,
+            query_args['term'] = ''
-                                     search=query_args,
+        else: # Otherwise, use db search for everything
-                                     rss_filter=rss_query_string)
+            query_args['term'] = term or ''
        query = search_db(**query_args)
        if rss:
            return render_rss('/', query, use_elastic=False)
        else:
            rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
            # Use elastic is always false here because we only hit this section
            # if we're browsing without a search term (which means we default to DB)
            # or if ES is disabled
            return flask.render_template('home.html',
                                         use_elastic=False,
                                         torrent_query=query,
                                         search=query_args,
                                         rss_filter=rss_query_string)
@app.route('/user/<user_name>')
@ -291,6 +222,10 @@ def view_user(user_name):
    if page:
        page = int(page)
    per_page = app.config.get('RESULTS_PER_PAGE')
    if not per_page:
        per_page = DEFAULT_PER_PAGE
    query_args = {
        'term': term or '',
        'user': user.id,
@ -299,40 +234,82 @@ def view_user(user_name):
        'category': category or '0_0',
        'quality_filter': quality_filter or '0',
        'page': page or 1,
-        'rss': False
+        'rss': False,
        'per_page': per_page
    }
-    # God mode
+    if flask.g.user:
-    if flask.g.user and flask.g.user.is_admin:
+        query_args['logged_in_user'] = flask.g.user
-        query_args['admin'] = True
+        if flask.g.user.is_admin:  # God mode
-
+            query_args['admin'] = True
    query = search(**query_args)
    # Use elastic search for term searching
    rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
-    return flask.render_template('user.html',
+    use_elastic = app.config.get('USE_ELASTIC_SEARCH')
-                                 torrent_query=query,
+    if use_elastic and term:
-                                 search=query_args,
+        query_args['term'] = term
-                                 user=user,
+
-                                 user_page=True,
+        max_search_results = app.config.get('ES_MAX_SEARCH_RESULT')
-                                 rss_filter=rss_query_string)
+        if not max_search_results:
            max_search_results = DEFAULT_MAX_SEARCH_RESULT
        max_page = min(query_args['page'], int(math.ceil(max_search_results / float(per_page)))) # Only allow up to (max_search_results / page) pages 
        query_args['page'] = max_page
        query_args['max_search_results'] = max_search_results
        query_results = search_elastic(**query_args)
        max_results = min(max_search_results, query_results['hits']['total'])
        # change p= argument to whatever you change page_parameter to or pagination breaks
        pagination = Pagination(p=query_args['page'], per_page=per_page,
                                total=max_results, bs_version=3, page_parameter='p',
                                display_msg=SERACH_PAGINATE_DISPLAY_MSG)
        return flask.render_template('user.html',
                                     use_elastic=True,
                                     pagination=pagination,
                                     torrent_query=query_results,
                                     search=query_args,
                                     user=user,
                                     user_page=True,
                                     rss_filter=rss_query_string)
    # Similar logic as home page
    else:
        if use_elastic:
            query_args['term'] = ''
        else:
            query_args['term'] = term or ''
        query = search_db(**query_args)
        return flask.render_template('user.html',
                                     use_elastic=False,
                                     torrent_query=query,
                                     search=query_args,
                                     user=user,
                                     user_page=True,
                                     rss_filter=rss_query_string)
@app.template_filter('rfc822')
 def _jinja2_filter_rfc822(date, fmt=None):
    """Format a datetime as an RFC 2822 date string for the RSS feed.

    ``fmt`` is accepted but unused. A naive ``date`` is interpreted as
    local time, matching the previous ``strftime('%s')`` behaviour.
    """
    # strftime('%s') is a platform-specific extension; timestamp() is portable.
    return formatdate(date.timestamp())
@app.template_filter('rfc822_es')
 def _jinja2_filter_rfc822(datestr, fmt=None):
    """Format a 'YYYY-MM-DDTHH:MM:SS' string as an RFC 2822 date string.

    ``fmt`` is accepted but unused. The parsed value is naive and is
    interpreted as local time, matching the previous ``strftime('%s')``
    behaviour. strptime raises ValueError on any other input format.
    """
    parsed = datetime.strptime(datestr, '%Y-%m-%dT%H:%M:%S')
    # strftime('%s') is a platform-specific extension; timestamp() is portable.
    return formatdate(parsed.timestamp())
-def render_rss(label, query):
+
 def render_rss(label, query, use_elastic):
    rss_xml = flask.render_template('rss.xml',
                                    use_elastic=use_elastic,
                                    term=label,
                                    site_url=flask.request.url_root,
-                                    query=query)
+                                    torrent_query=query)
    response = flask.make_response(rss_xml)
    response.headers['Content-Type'] = 'application/xml'
    return response
-#@app.route('/about', methods=['GET'])
+# @app.route('/about', methods=['GET'])
 # def about():
 #    return flask.render_template('about.html')
--- a/nyaa/search.py
+++ b/nyaa/search.py
@ -0,0 +1,317 @@
 import flask
 import re
 import math
 import json
 import shlex
 from nyaa import app, db
 from nyaa import models
 import sqlalchemy_fulltext.modes as FullTextMode
 from sqlalchemy_fulltext import FullTextSearch
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search, Q
 def search_elastic(term='', user=None, sort='id', order='desc',
                   category='0_0', quality_filter='0', page=1,
                   rss=False, admin=False, logged_in_user=None,
                   per_page=75, max_search_results=1000):
    """Run a torrent search against Elasticsearch and return the response.

    Args:
        term: query text, sent as a ``simple_query_string`` query with AND
            as the default operator.
        user: id of the uploader to restrict to (aborts 404 if unknown).
        sort: one of id/size/seeders/leechers/downloads (case-insensitive).
        order: 'asc' or 'desc' (case-insensitive).
        category: '<main>_<sub>' ids; '0_0' means no category filter.
        quality_filter: '0' all, '1' no remakes, '2' trusted, '3' complete.
        page: 1-based page number, clamped to the pages allowed by
            ``max_search_results``.
        rss: force id/desc ordering and return only the first ``per_page`` hits.
        admin: skip the deleted/hidden visibility filters.
        logged_in_user: object with an ``id`` attribute, or None.
        per_page: number of hits per page.
        max_search_results: upper bound on the result window.

    Returns:
        The value of ``Search.execute()``.

    Aborts with 400 on an invalid sort, order, quality filter or category.
    """
    # This function can easily be memcached now

    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count'
    }

    # Look the field up with the same normalized key that was validated,
    # otherwise e.g. sort='ID' passes the check and then raises KeyError.
    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)
    es_sort = es_sort_keys[sort_]

    order_ = order.lower()
    if order_ not in ('desc', 'asc'):
        flask.abort(400)

    # Only allow ID, desc if RSS. This must override es_sort itself,
    # since es_sort is what gets passed to s.sort() below.
    if rss:
        es_sort = es_sort_keys['id']
        order_ = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if order_ == 'desc':
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3'   # Only completed
    ]

    if quality_filter.lower() not in quality_keys:
        flask.abort(400)

    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client, index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        s = s.query('simple_query_string',
                    analyzer='my_search_analyzer',
                    default_operator="AND",
                    query=term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                hiddenFilter = Q('term', hidden=False)
                userFilter = Q('term', uploader_id=logged_in_user.id)
                combinedFilter = hiddenFilter | userFilter
                s = s.filter('bool', filter=[combinedFilter])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    if quality_filter == 0:
        pass
    elif quality_filter == 1:
        s = s.filter('term', remake=False)
    elif quality_filter == 2:
        s = s.filter('term', trusted=True)
    elif quality_filter == 3:
        s = s.filter('term', complete=True)

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        # Clamp to [1, last allowed page]; a page below 1 would otherwise
        # produce a negative slice start.
        last_page = int(math.ceil(max_search_results / float(per_page)))
        max_page = max(1, min(page, last_page))
        from_idx = (max_page-1)*per_page
        to_idx = min(max_search_results, max_page*per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()
 def search_db(term='', user=None, sort='id', order='desc', category='0_0',
              quality_filter='0', page=1, rss=False, admin=False,
              logged_in_user=None, per_page=75):
    """Build the SQL torrent listing/search query.

    Args:
        term: full-text search string; words shorter than 2 chars are skipped.
        user: id of the uploader to restrict to (aborts 404 if unknown).
        sort: one of id/size/seeders/leechers/downloads (case-insensitive).
        order: 'asc' or 'desc' (case-insensitive).
        category: '<main>_<sub>' ids; '0_0' means no category filter.
        quality_filter: '0' all, '1' no remakes, '2' trusted, '3' complete.
        page: page number handed to ``paginate_faste``.
        rss: force id/desc ordering and return a limited query.
        admin: skip the deleted/hidden visibility filters.
        logged_in_user: object with an ``id`` attribute, or None.
        per_page: row limit for RSS, page size otherwise.

    Returns:
        A query limited to ``per_page`` rows when ``rss`` is true,
        otherwise the result of ``paginate_faste``.

    Aborts with 400 on an invalid sort, order, quality filter or category.
    """
    sort_keys = {
        'id': models.Torrent.id,
        'size': models.Torrent.filesize,
        # 'name': models.Torrent.display_name, # Disable this because we disabled this in search_elastic, for the sake of consistency
        'seeders': models.Statistic.seed_count,
        'leechers': models.Statistic.leech_count,
        'downloads': models.Statistic.download_count
    }

    # Look the column up with the same normalized key that was validated,
    # otherwise e.g. sort='ID' passes the check and then raises KeyError.
    sort_ = sort.lower()
    if sort_ not in sort_keys:
        flask.abort(400)
    sort = sort_keys[sort_]

    order_ = order.lower()
    if order_ not in ('desc', 'asc'):
        flask.abort(400)
    # Use the normalized value below; column objects only have asc()/desc().
    order = order_

    filter_keys = {
        '0': None,
        '1': (models.TorrentFlags.REMAKE, False),
        '2': (models.TorrentFlags.TRUSTED, True),
        '3': (models.TorrentFlags.COMPLETE, True)
    }

    # None is a valid value ('0' = no filter), so a sentinel marks "missing".
    sentinel = object()
    filter_tuple = filter_keys.get(quality_filter.lower(), sentinel)
    if filter_tuple is sentinel:
        flask.abort(400)

    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            # Same validation as search_elastic: reject categories that do
            # not exist instead of silently dropping the category filter.
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # Force sort by id desc if rss
    if rss:
        sort = sort_keys['id']
        order = 'desc'

    same_user = False
    if logged_in_user:
        same_user = logged_in_user.id == user

    if term:
        query = db.session.query(models.TorrentNameSearch)
    else:
        query = models.Torrent.query

    # User view (/user/username)
    if user:
        query = query.filter(models.Torrent.uploader_id == user)

        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in user is not the same as the user being viewed, show only torrents that aren't hidden or anonymous
            # If logged in user is the same as the user being viewed, show all torrents including hidden and anonymous ones
            # On RSS pages in user view, show only torrents that aren't hidden or anonymous no matter what
            if not same_user or rss:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN |
                                                                      models.TorrentFlags.ANONYMOUS)).is_(False))
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                query = query.filter((models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False)) |
                                     (models.Torrent.uploader_id == logged_in_user.id))
            # Otherwise, show all torrents that aren't hidden
            else:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False))

    if main_category:
        query = query.filter(models.Torrent.main_category_id == main_cat_id)
    elif sub_category:
        query = query.filter((models.Torrent.main_category_id == main_cat_id) &
                             (models.Torrent.sub_category_id == sub_cat_id))

    if filter_tuple:
        query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))

    if term:
        for item in shlex.split(term, posix=False):
            if len(item) >= 2:
                query = query.filter(FullTextSearch(
                    item, models.TorrentNameSearch, FullTextMode.NATURAL))

    # Sort and order; join the owning table when sorting on a non-Torrent column
    if sort.class_ != models.Torrent:
        query = query.join(sort.class_)

    query = query.order_by(getattr(sort, order)())

    if rss:
        query = query.limit(per_page)
    else:
        query = query.paginate_faste(page, per_page=per_page, step=5)

    return query
--- a/nyaa/static/css/main.css
+++ b/nyaa/static/css/main.css
@ -98,3 +98,13 @@ table.torrent-list thead th.sorting_desc:after {
 	    margin-bottom: 10px;
 	}
 }
 /* elasticsearch term highlight: draws a small rounded, tinted box around
    each highlighted search term. NOTE(review): presumably this class is
    emitted by the elasticsearch highlighter's tag settings; confirm against
    the search query before renaming it. */
 .hlt1 {
    font-style: normal;
    display: inline-block;
    padding: 0 3px;
    border-radius: 3px;
    border: 1px solid rgba(100, 56, 0, 0.8);
    background: rgba(200,127,0,0.3);
 }
--- a/nyaa/templates/rss.xml
+++ b/nyaa/templates/rss.xml
@ -4,20 +4,32 @@
 		<description>RSS Feed for {{ term }}</description>
 		<link>{{ url_for('home', _external=True) }}</link>
 		<atom:link href="{{ url_for('home', page='rss', _external=True) }}" rel="self" type="application/rss+xml" />
-		{% for torrent in query %}
+		{% for torrent in torrent_query %}
 		{% if torrent.has_torrent %}
 		<item>
 			<title>{{ torrent.display_name }}</title>
 			{% if use_elastic %}
 			<link>{{ url_for('download_torrent', torrent_id=torrent.meta.id, _external=True) }}</link>
 			<guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.meta.id, _external=True) }}</guid>
 			<pubDate>{{ torrent.created_time|rfc822_es }}</pubDate>
 			{% else %}
 			<link>{{ url_for('download_torrent', torrent_id=torrent.id, _external=True) }}</link>
 			<guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.id, _external=True) }}</guid>
 			<pubDate>{{ torrent.created_time|rfc822 }}</pubDate>
 			{% endif %}			
 		</item>
 		{% else %}
 		<item>
 			<title>{{ torrent.display_name }}</title>
 			{% if use_elastic %}
 			<link>{{ create_magnet_from_info(torrent.display_name, torrent.info_hash) }}</link>
 			<guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.meta.id, _external=True) }}</guid>
 			<pubDate>{{ torrent.created_time|rfc822_es }}</pubDate>
 			{% else %}
 			<link>{{ torrent.magnet_uri }}</link>
 			<guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.id, _external=True) }}</guid>
 			<pubDate>{{ torrent.created_time|rfc822 }}</pubDate>
 			{% endif %}
 		</item>
 		{% endif %}
 		{% endfor %}
--- a/nyaa/templates/search_results.html
+++ b/nyaa/templates/search_results.html
@ -8,7 +8,7 @@
 	{{ caller() }}
 </th>
 {% endmacro %}
-{% if torrent_query.items %}
+{% if (use_elastic and torrent_query.hits.total > 0) or (torrent_query.items) %}
 <div class="table-responsive">
 	<table class="table table-bordered table-hover table-striped torrent-list">
 		<thead>
@ -16,7 +16,7 @@
 				{% call render_column_header("hdr-category", "width:80px;", center_text=True) %}
 					<div>Category</div>
 				{% endcall %}
-				{% call render_column_header("hdr-name", "width:auto;", sort_key="name") %}
+				{% call render_column_header("hdr-name", "width:auto;") %}
 					<div>Name</div>
 				{% endcall %}
 				{% call render_column_header("hdr-link", "width:70px;", center_text=True) %}
@ -45,27 +45,51 @@
 			</tr>
 		</thead>
 		<tbody>
-			{% for torrent in torrent_query.items %}
+			{% set torrents = torrent_query if use_elastic else torrent_query.items %}
 			{% for torrent in torrents %}
 			<tr class="{% if torrent.deleted %}deleted{% elif torrent.hidden %}warning{% elif torrent.remake %}danger{% elif torrent.trusted %}success{% else %}default{% endif %}">
-				{% set cat_id = (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %}
+				{% set cat_id = (torrent.main_category_id|string) + '_' + (torrent.sub_category_id|string) if use_elastic else (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %}
 				{% set icon_dir = config.SITE_FLAVOR %}
 				<td style="padding:0 4px;">
 				{% if use_elastic %}
 				<a href="/?c={{ cat_id }}" title="{{ torrent.main_category_id }} - {{ torrent.sub_category_id }}">
 				{% else %}
 				<a href="/?c={{ cat_id }}" title="{{ torrent.main_category.name }} - {{ torrent.sub_category.name }}">
 				{% endif %}
 					<img src="/static/img/icons/{{ icon_dir }}/{{ cat_id }}.png">
 				</a>
 				</td>
 				{% if use_elastic %}
                <td><a href="{{ url_for('view_torrent', torrent_id=torrent.meta.id) }}">{%if "highlight" in torrent.meta %}{{ torrent.meta.highlight.display_name[0] | safe }}{% else %}{{torrent.display_name}}{%endif%}</a></td>
 				{% else %}
 				<td><a href="{{ url_for('view_torrent', torrent_id=torrent.id) }}">{{ torrent.display_name | escape }}</a></td>
 				{% endif %}
 				<td style="white-space: nowrap;text-align: center;">
 					{% if torrent.has_torrent %}<a href="{{ url_for('download_torrent', torrent_id=torrent.id) }}"><i class="fa fa-fw fa-download"></i></a>{% endif %}
 					{% if use_elastic %}
 					<a href="{{ create_magnet_from_info(torrent.display_name, torrent.info_hash) }}"><i class="fa fa-fw fa-magnet"></i></a>
 					{% else %}
 					<a href="{{ torrent.magnet_uri }}"><i class="fa fa-fw fa-magnet"></i></a>
 					{% endif %}
 				</td>
 				<td class="text-center">{{ torrent.filesize | filesizeformat(True) }}</td>
 				{% if use_elastic %}
                <td class="text-center" data-timestamp="{{ torrent.created_time | utc_time }}">{{ torrent.created_time | display_time }}</td>
 				{% else %}
 				<td class="text-center" data-timestamp="{{ torrent.created_utc_timestamp|int }}">{{ torrent.created_time.strftime('%Y-%m-%d %H:%M') }}</td>
 				{% endif %}
 				{% if config.ENABLE_SHOW_STATS %}
 				{% if use_elastic %}
 				<td class="text-center" style="color: green;">{{ torrent.seed_count }}</td>
 				<td class="text-center" style="color: red;">{{ torrent.leech_count }}</td>
 				<td class="text-center">{{ torrent.download_count }}</td>
 				{% else %}
 				<td class="text-center" style="color: green;">{{ torrent.stats.seed_count }}</td>
 				<td class="text-center" style="color: red;">{{ torrent.stats.leech_count }}</td>
 				<td class="text-center">{{ torrent.stats.download_count }}</td>
 				{% endif %}
 				{% endif %}
 			</tr>
 			{% endfor %}
 		</tbody>
@ -76,6 +100,11 @@
 {% endif %}
 <center>
 	{% if use_elastic %}
 	{{ pagination.info }}
 	{{ pagination.links }}
 	{% else %}
 	{% from "bootstrap/pagination.html" import render_pagination %}
 	{{ render_pagination(torrent_query) }}
 	{% endif %}
 </center>
--- a/nyaa/torrents.py
+++ b/nyaa/torrents.py
@ -3,6 +3,7 @@ import base64
 import time
 from urllib.parse import urlencode
 from orderedset import OrderedSet
 from nyaa import app
 from nyaa import bencode
 from nyaa import app
@ -53,10 +54,23 @@ def get_trackers(torrent):
    return list(trackers)
 def get_trackers_magnet():
     """Return the tracker URLs to advertise in a magnet link.

     The configured MAIN_ANNOUNCE_URL (when set) comes first, followed by
     the default tracker list; duplicates are dropped while keeping order.
     """
     ordered = OrderedSet()
     # Our own announce URL takes priority over everything else
     primary = app.config.get('MAIN_ANNOUNCE_URL')
     if primary:
         ordered.add(primary)
     # ...then whatever the default tracker list provides
     ordered.update(default_trackers())
     return list(ordered)
 def create_magnet(torrent, max_trackers=5, trackers=None):
    if trackers is None:
-        trackers = get_trackers(torrent)
+        trackers = get_trackers_magnet()
    magnet_parts = [
        ('dn', torrent.display_name)
@ -68,6 +82,24 @@ def create_magnet(torrent, max_trackers=5, trackers=None):
    return 'magnet:?xt=urn:btih:' + b32_info_hash + '&' + urlencode(magnet_parts)
 # For processing ES links
@app.context_processor
 def create_magnet_from_info():
    def _create_magnet_from_info(display_name, info_hash, max_trackers=5, trackers=None):
        if trackers is None:
            trackers = get_trackers_magnet()
        magnet_parts = [
            ('dn', display_name)
        ]
        for tracker in trackers[:max_trackers]:
            magnet_parts.append(('tr', tracker))
        b32_info_hash = base64.b32encode(bytes.fromhex(info_hash)).decode('utf-8')
        return 'magnet:?xt=urn:btih:' + b32_info_hash + '&' + urlencode(magnet_parts)
    return dict(create_magnet_from_info=_create_magnet_from_info)
 def create_default_metadata_base(torrent, trackers=None):
    if trackers is None:
        trackers = get_trackers(torrent)
--- a/requirements.txt
+++ b/requirements.txt
@ -24,11 +24,17 @@ pycodestyle==2.3.1
 pycparser==2.17
 pyparsing==2.2.0
 six==1.10.0
-SQLAlchemy>=1.1.9
+SQLAlchemy==1.1.9
 SQLAlchemy-FullText-Search==0.2.3
-SQLAlchemy-Utils>=0.32.14
+SQLAlchemy-Utils==0.32.14
 uWSGI==2.0.15
 visitor==0.1.3
 webassets==0.12.1
 Werkzeug==0.12.1
 WTForms==2.1
 ## elasticsearch dependencies
 elasticsearch==5.3.0
 elasticsearch-dsl==5.2.0
 progressbar2==3.20.0
 mysql-replication==0.13
 flask-paginate==0.4.5
--- a/sync_es.py
+++ b/sync_es.py
@ -0,0 +1,168 @@
 #!/usr/bin/env python
 """
 stream changes in mysql (on the torrents and statistics table) into
 elasticsearch as they happen on the binlog. This keeps elasticsearch in sync
 with whatever you do to the database, including stuff like admin queries. Also,
 because mysql keeps the binlog around for N days before deleting old stuff, you
 can survive a hiccup of elasticsearch or this script dying and pick up where
 you left off.
 For that "picking up" part, this script depends on one piece of external state:
 its last known binlog filename and position. This is saved off as a JSON file
 to a configurable location on the filesystem periodically. If the file is not
 present then you can initialize it with the values from `SHOW MASTER STATUS`
 from the mysql repl, which will start the sync from current state.
 In the case of catastrophic elasticsearch meltdown where you need to
 reconstruct the index, you'll want to be a bit careful with coordinating
 sync_es and import_to_es scripts. If you run import_to_es first and then run
 sync_es against SHOW MASTER STATUS, anything that changed the database between
 the import_to_es run and the start of sync_es will be lost. Instead, you can run SHOW MASTER
 STATUS _before_ you run import_to_es. That way you'll definitely pick up any
 changes that happen while the import_to_es script is dumping stuff from the
 database into es, at the expense of redoing a (small) amount of indexing.
 """
 from elasticsearch import Elasticsearch
 from pymysqlreplication import BinLogStreamReader
 from pymysqlreplication.row_event import UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent
 from datetime import datetime
 from nyaa.models import TorrentFlags
 import sys
 import json
 import time
 import logging
 # Logging: default root handler, with this script's own logger at INFO.
 logging.basicConfig()
 log = logging.getLogger('sync_es')
 log.setLevel(logging.INFO)
 #logging.getLogger('elasticsearch').setLevel(logging.DEBUG)
 # Where the last processed binlog file/position is persisted between runs.
 # in prod want in /var/lib somewhere probably
 SAVE_LOC = "/var/lib/sync_es_position.json"
 # MySQL connection settings for the replication stream.
 # NOTE(review): credentials are hard-coded here; see the TODO below about
 # reading them from config.py instead.
 MYSQL_HOST = '127.0.0.1'
 MYSQL_PORT = 3306
 MYSQL_USER = 'test'
 MYSQL_PW = 'test123'
 # Name of the only schema whose binlog events are processed.
 NT_DB = 'nyaav2'
 # The position file must already exist (open() raises otherwise) and hold a
 # JSON object with "log_file" and "log_pos" keys, which are read below.
 with open(SAVE_LOC) as f:
     pos = json.load(f)
 # No hosts are passed, so the client's default connection settings apply.
 es = Elasticsearch()
 # Binlog reader, resumed from the saved position and restricted to row
 # events on the torrent/statistics tables.
 stream = BinLogStreamReader(
         # TODO parse out from config.py or something
         connection_settings = {
             'host': MYSQL_HOST,
             'port': MYSQL_PORT,
             'user': MYSQL_USER,
             'passwd': MYSQL_PW
         },
         server_id=10, # arbitrary
         # only care about this database currently
         only_schemas=[NT_DB],
         # these tables in the database
         only_tables=["nyaa_torrents", "nyaa_statistics", "sukebei_torrents", "sukebei_statistics"],
         # from our save file
         resume_stream=True,
         log_file=pos['log_file'],
         log_pos=pos['log_pos'],
         # skip the other stuff like table mapping
         only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
         # if we're at the head of the log, block until something happens
         # note it'd be nice to block async-style instead, but the mainline
         # binlogreader is synchronous. there is an (unmaintained?) fork
         # using aiomysql if anybody wants to revive that.
         blocking=True)
 def reindex_torrent(t, index_name):
     """Upsert the elasticsearch document for a single torrent row.

     `t` is the row's column mapping taken from a binlog row event and
     `index_name` is the elasticsearch index to write to. The write is a
     partial update with `doc_as_upsert`, so fields not listed here are
     left alone on an existing document.
     """
     # XXX annoyingly different from import_to_es, and
     # you need to keep them in sync manually.
     flags = t['flags']
     torrent_id = t['id']
     document = {"id": torrent_id}
     # columns copied into the document unchanged
     document.update((column, t[column]) for column in (
         "display_name", "created_time", "updated_time", "description"))
     # not analyzed but included so we can render magnet links
     # without querying sql again.
     document["info_hash"] = t['info_hash'].hex()
     document.update((column, t[column]) for column in (
         "filesize", "uploader_id", "main_category_id", "sub_category_id"))
     # XXX all the bitflags are numbers; each one becomes its own boolean.
     # TODO instead of indexing and filtering hidden/deleted later
     # could delete from es entirely. Probably won't matter
     # for at least a few months.
     flag_fields = (
         ("anonymous", TorrentFlags.ANONYMOUS),
         ("trusted", TorrentFlags.TRUSTED),
         ("remake", TorrentFlags.REMAKE),
         ("complete", TorrentFlags.COMPLETE),
         ("hidden", TorrentFlags.HIDDEN),
         ("deleted", TorrentFlags.DELETED),
     )
     for field_name, flag in flag_fields:
         document[field_name] = bool(flags & flag)
     document["has_torrent"] = bool(t['has_torrent'])
     # update, so we don't delete the stats if present
     request_body = {"doc": document, "doc_as_upsert": True}
     es.update(index=index_name, doc_type='torrent', id=torrent_id, body=request_body)
 def reindex_stats(s, index_name):
     """Write one statistics row into its torrent's elasticsearch document.

     `s` is the statistics row's column mapping from a binlog row event;
     the document is addressed by the row's `torrent_id`. This is a plain
     partial update (no upsert is requested).
     """
     torrent_id = s['torrent_id']
     stats_fields = {
         "stats_last_updated": s["last_updated"],
         "download_count": s["download_count"],
         "leech_count": s['leech_count'],
         "seed_count": s['seed_count'],
     }
     es.update(index=index_name, doc_type='torrent', id=torrent_id,
               body={"doc": stats_fields})
 # Main loop: apply every binlog row event to elasticsearch and periodically
 # persist the stream position so a restart can resume from it.
 # Each replicated table feeds exactly one elasticsearch index.
 TORRENT_TABLES = {"nyaa_torrents": "nyaa", "sukebei_torrents": "sukebei"}
 STATS_TABLES = {"nyaa_statistics": "nyaa", "sukebei_statistics": "sukebei"}
 # n counts processed events; last_save is when the position was last written.
 n = 0
 last_save = time.time()
 for event in stream:
     table = event.table
     if table in TORRENT_TABLES:
         index_name = TORRENT_TABLES[table]
         reindex = reindex_torrent
     elif table in STATS_TABLES:
         # The index is looked up from the statistics table name itself, so
         # nyaa stats go to "nyaa" rather than always landing in "sukebei".
         index_name = STATS_TABLES[table]
         reindex = reindex_stats
     else:
         raise Exception(f"unknown table {table}")
     for row in event.rows:
         if isinstance(event, WriteRowsEvent):
             reindex(row['values'], index_name)
         elif isinstance(event, UpdateRowsEvent):
             reindex(row['after_values'], index_name)
         elif isinstance(event, DeleteRowsEvent):
             if table in TORRENT_TABLES:
                 # just delete it. Events after the last saved position are
                 # replayed on restart, so the document may already be gone;
                 # a 404 here must not kill the sync.
                 es.delete(index=index_name, doc_type='torrent',
                           id=row['values']['id'], ignore=404)
             # stats row deleted: uh ok. assume that the torrent row will
             # get deleted later.
         else:
             raise Exception(f"unknown event {type(event)}")
     n += 1
     now = time.time()
     if n % 100 == 0 or now - last_save > 30:
         log.info(f"saving position {stream.log_file}/{stream.log_pos}")
         with open(SAVE_LOC, 'w') as f:
             json.dump({"log_file": stream.log_file, "log_pos": stream.log_pos}, f)
         # Restart the 30s timer; otherwise every event after the first 30
         # seconds would rewrite the position file.
         last_save = now