diff --git a/import_to_es.py b/import_to_es.py
new file mode 100644
index 0000000..e714da5
--- /dev/null
+++ b/import_to_es.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+"""
+Bulk load torrents from mysql into elasticsearch `nyaav2` index,
+which is assumed to already exist.
+This is a one-shot deal, so you'd either need to complement it
+with a cron job or some binlog-reading thing (TODO)
+"""
+from nyaa.models import Torrent
+from elasticsearch import Elasticsearch
+from elasticsearch import helpers
+import progressbar
+import sys
+
+bar = progressbar.ProgressBar(
+    max_value=Torrent.query.count(),
+    widgets=[
+        progressbar.SimpleProgress(),
+        ' [', progressbar.Timer(), '] ',
+        progressbar.Bar(),
+        ' (', progressbar.ETA(), ') ',
+    ])
+
+es = Elasticsearch()
+
+# turn into thing that elasticsearch indexes. We flatten in
+# the stats (seeders/leechers) so we can order by them in es naturally.
+# we _don't_ dereference uploader_id to the user's display name however,
+# instead doing that at query time. I _think_ this is right because
+# we don't want to reindex all the user's torrents just because they
+# changed their name, and we don't really want to FTS search on the user anyway.
+# Maybe it's more convenient to dereference though.
+def mk_es(t):
+    """Build the bulk-index action (metadata plus `_source`) for torrent `t`."""
+    return {
+        "_id": t.id,
+        "_type": "torrent",
+        "_index": "nyaav2",
+        "_source": {
+            # we're also indexing the id as a number so you can
+            # order by it. seems like this is just equivalent to
+            # order by created_time, but oh well
+            "id": t.id,
+            "display_name": t.display_name,
+            "created_time": t.created_time,
+            "updated_time": t.updated_time,
+            "description": t.description,
+            # not analyzed but included so we can render magnet links
+            # without querying sql again.
+            "info_hash": t.info_hash.hex(),
+            "filesize": t.filesize,
+            "uploader_id": t.uploader_id,
+            "main_category_id": t.main_category_id,
+            "sub_category_id": t.sub_category_id,
+            # XXX all the bitflags are numbers
+            "anonymous": bool(t.anonymous),
+            "trusted": bool(t.trusted),
+            "remake": bool(t.remake),
+            "complete": bool(t.complete),
+            # TODO instead of indexing and filtering later
+            # could delete from es entirely. Probably won't matter
+            # for at least a few months.
+            "hidden": bool(t.hidden),
+            "deleted": bool(t.deleted),
+            "has_torrent": t.has_torrent,
+            # XXX last_updated isn't initialized
+            "stats_last_updated": t.stats.last_updated or t.created_time,
+            "download_count": t.stats.download_count,
+            "leech_count": t.stats.leech_count,
+            "seed_count": t.stats.seed_count,
+        }
+    }
+
+# page through an sqlalchemy query, like the per_fetch but
+# doesn't break the eager joins it's doing against the stats table.
+# annoying that this isn't built in somehow.
+def page_query(query, limit=sys.maxsize, batch_size=10000):
+    """Yield rows of `query`, fetched `batch_size` at a time, up to `limit` rows.
+
+    Paging uses OFFSET/LIMIT slices, so `query` needs a stable ORDER BY;
+    without one the database may skip or repeat rows between pages.
+    """
+    start = 0
+    while start < limit:
+        stop = min(limit, start + batch_size)
+        fetched = 0
+        for thing in query.slice(start, stop):
+            fetched += 1
+            yield thing
+        if fetched < stop - start:
+            # short (or empty) page: the table is exhausted
+            break
+        # clamp in case rows were inserted after the initial count()
+        bar.update(min(stop, bar.max_value))
+        start = stop
+
+# order by id so consecutive pages never overlap or leave gaps
+helpers.bulk(
+    es,
+    (mk_es(t) for t in page_query(Torrent.query.order_by(Torrent.id))),
+    chunk_size=10000)
+bar.finish()
diff --git a/nyaa/routes.py b/nyaa/routes.py
index 4064b15..3e87a2a 100644
--- a/nyaa/routes.py
+++ b/nyaa/routes.py
@@ -27,6 +27,11 @@ from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.utils import formatdate
 
+from elasticsearch import Elasticsearch
+from elasticsearch_dsl import Search, Q
+
+es_client = Elasticsearch()
+
 DEBUG_API = False
 
 
@@ -67,6 +72,16 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
     sort_ = sort.lower()
     if sort_ not in sort_keys:
         flask.abort(400)
+
+    # XXX gross why are all the names subtly different
+    es_sort = ({
+        'id': 'id',
+        'size': 'filesize',
+        'name': 'display_name',
+        'seeders': 'seed_count',
+        'leechers': 'leech_count',
+        'downloads': 'download_count'
+    })[sort_]  # index with the validated, lower-cased key, not the raw argument
     sort = sort_keys[sort]
 
     order_keys = {
@@ -78,6 +93,10 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
     if order_ not in order_keys:
         flask.abort(400)
 
+    # funky, es sort is default asc, prefixed by '-' if desc
+    if order_ == "desc":  # order_ is the value validated against order_keys above
+        es_sort = "-" + es_sort
+
     filter_keys = {
         '0': None,
         '1': (models.TorrentFlags.REMAKE, False),
@@ -126,28 +145,37 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
     if flask.g.user:
         same_user = flask.g.user.id == user
 
+    s = Search(using=es_client, index='nyaav2')
     if term:
         query = db.session.query(models.TorrentNameSearch)
+        s = s.query("query_string", default_field="display_name", default_operator="AND", query=term)
     else:
         query = models.Torrent.query
 
     # Filter by user
     if user:
+        s = s.filter("term", uploader_id=user)
+
         query = query.filter(models.Torrent.uploader_id == user)
         # If admin, show everything
         if not admin:
             # If user is not logged in or the accessed feed doesn't belong to user,
             # hide anonymous torrents belonging to the queried user
             if not same_user:
+                s = s.filter("term", anonymous=False).filter("term", deleted=False)
                 query = query.filter(models.Torrent.flags.op('&')(
                     int(models.TorrentFlags.ANONYMOUS | models.TorrentFlags.DELETED)).is_(False))
 
     if main_category:
+        s = s.filter("term", main_category_id=main_cat_id)
         query = query.filter(models.Torrent.main_category_id == main_cat_id)
     elif sub_category:
+        s = s.filter("term", main_category_id=main_cat_id)
+        s = s.filter("term", sub_category_id=sub_cat_id)
         query = query.filter((models.Torrent.main_category_id == main_cat_id) &
                              (models.Torrent.sub_category_id == sub_cat_id))
 
+    # TODO es equivalent: term filter on the flag's boolean field (e.g. remake=False)
     if filter_tuple:
         query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))
 
@@ -157,6 +185,7 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
             int(models.TorrentFlags.HIDDEN | models.TorrentFlags.DELETED)).is_(False))
 
     if term:
+        # note already handled in es
         for item in shlex.split(term, posix=False):
             if len(item) >= 2:
                 query = query.filter(FullTextSearch(
@@ -166,14 +195,25 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
     if sort.class_ != models.Torrent:
         query = query.join(sort.class_)
 
+    s = s.sort(es_sort)
     query = query.order_by(getattr(sort, order)())
 
+    per = app.config['RESULTS_PER_PAGE']
     if rss:
-        query = query.limit(app.config['RESULTS_PER_PAGE'])
+        # mirror the old query.limit(): without a slice es returns only 10 hits
+        s = s[0:per]
     else:
-        query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
+        # page is 1-based?
+        s = s[(page-1)*per:page*per]
+        #query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
 
-    return query
+    s = s.highlight_options(tags_schema='styled')
+    s = s.highlight("display_name")
+
+    #return query
+    # log the generated query at debug level instead of printing on every request
+    app.logger.debug('es query: %s', s.to_dict())
+    return s.execute()
 
 
 @app.errorhandler(404)
@@ -445,6 +485,7 @@ def activate_user(payload):
 
     user.status = models.UserStatusType.ACTIVE
 
+    db.session.add(user)
     db.session.commit()
 
 
diff --git a/nyaa/static/css/main.css b/nyaa/static/css/main.css
index 83ca0ea..1743595 100644
--- a/nyaa/static/css/main.css
+++ b/nyaa/static/css/main.css
@@ -97,4 +97,14 @@ table.torrent-list thead th.sorting_desc:after {
 		margin-left: 20px;
 		margin-bottom: 10px;
 	}
-}
\ No newline at end of file
+}
+
+/* elasticsearch term highlight */
+.hlt1 {
+	font-style: normal;
+	display: inline-block;
+	padding: 0 3px;
+	border-radius: 3px;
+	border: 1px solid rgba(100, 56, 0, 0.8);
+	background: rgba(200,127,0,0.3);
+}
diff --git a/nyaa/templates/search_results.html b/nyaa/templates/search_results.html
index 
52c27f0..23f9a40 100644 --- a/nyaa/templates/search_results.html +++ b/nyaa/templates/search_results.html @@ -8,7 +8,7 @@ {{ caller() }} {% endmacro %} -{% if torrent_query.items %} +{% if torrent_query.hits.total > 0 %}
@@ -45,26 +45,26 @@ - {% for torrent in torrent_query.items %} + {% for torrent in torrent_query %} - {% set cat_id = (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %} + {% set cat_id = (torrent.main_category_id|string) + '_' + (torrent.sub_category_id|string) %} {% set icon_dir = config.SITE_FLAVOR %} - + - + {% if config.ENABLE_SHOW_STATS %} - - - + + + {% endif %} {% endfor %} @@ -75,7 +75,9 @@

No results found

{% endif %} +{#
{% from "bootstrap/pagination.html" import render_pagination %} {{ render_pagination(torrent_query) }}
+#}
diff --git a/requirements.txt b/requirements.txt
index 224866b..dbf234d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,11 +24,33 @@ pycodestyle==2.3.1
 pycparser==2.17
 pyparsing==2.2.0
 six==1.10.0
-SQLAlchemy>=1.1.9
+SQLAlchemy==1.1.9
 SQLAlchemy-FullText-Search==0.2.3
-SQLAlchemy-Utils>=0.32.14
+SQLAlchemy-Utils==0.32.14
 uWSGI==2.0.15
 visitor==0.1.3
 webassets==0.12.1
 Werkzeug==0.12.1
 WTForms==2.1
+## The following requirements were added by pip freeze:
+decorator==4.0.11
+elasticsearch==5.3.0
+elasticsearch-dsl==5.2.0
+ipython==6.0.0
+ipython-genutils==0.2.0
+jedi==0.10.2
+mysql-replication==0.13
+pexpect==4.2.1
+pickleshare==0.7.4
+# pkg-resources==0.0.0 left out: pip-freeze artifact, not installable from PyPI
+progressbar2==3.20.0
+prompt-toolkit==1.0.14
+ptyprocess==0.5.1
+Pygments==2.2.0
+PyMySQL==0.7.11
+python-dateutil==2.6.0
+python-utils==2.1.0
+simplegeneric==0.8.1
+traitlets==4.3.2
+urllib3==1.21.1
+wcwidth==0.1.7
- + {{ torrent.display_name | escape }}{{ torrent.meta.highlight.display_name[0] | safe }} {% if torrent.has_torrent %}{% endif %} {{ torrent.filesize | filesizeformat(True) }}{{ torrent.created_time.strftime('%Y-%m-%d %H:%M') }}{{ torrent.created_time }}{{ torrent.stats.seed_count }}{{ torrent.stats.leech_count }}{{ torrent.stats.download_count }}{{ torrent.seed_count }}{{ torrent.leech_count }}{{ torrent.download_count }}