Merge pull request #50 from qqueue/elasticsearchin

elasticsearch-based search (WIP)
This commit is contained in:
Johnny Ding 2017-05-14 20:06:33 -07:00 committed by GitHub
commit 8bca32a626
5 changed files with 184 additions and 15 deletions

94
import_to_es.py Normal file
View File

@ -0,0 +1,94 @@
#!/usr/bin/env python
"""
Bulk load torrents from mysql into elasticsearch `nyaav2` index,
which is assumed to already exist.
This is a one-shot deal, so you'd either need to complement it
with a cron job or some binlog-reading thing (TODO)
"""
from nyaa.models import Torrent
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import progressbar
import sys
# Count the rows once up front; the count becomes the bar's max_value.
_torrent_total = Torrent.query.count()

# Rendered left to right: "done of total", elapsed time, the bar itself, ETA.
_bar_widgets = [
    progressbar.SimpleProgress(),
    ' [', progressbar.Timer(), '] ',
    progressbar.Bar(),
    ' (', progressbar.ETA(), ') ',
]
bar = progressbar.ProgressBar(max_value=_torrent_total, widgets=_bar_widgets)

# Client constructed with no arguments, i.e. whatever the library defaults to.
es = Elasticsearch()
def mk_es(t, index="nyaav2", doc_type="torrent"):
    """Turn a torrent into a bulk-index action for elasticsearch.

    The stats (seeders/leechers/downloads) are flattened into the document
    so results can be ordered by them naturally in elasticsearch.

    ``uploader_id`` is deliberately *not* dereferenced to the user's display
    name; that is done at query time instead. Otherwise every torrent of a
    user would need reindexing whenever they change their name, and the user
    name is not something we want to full-text search on anyway. (It might
    still turn out to be more convenient to dereference it.)

    Args:
        t: the torrent to index. It must expose the attributes read below,
            including a ``stats`` object carrying the counters.
        index: name of the elasticsearch index the document is written to.
        doc_type: elasticsearch document type stored in ``_type``.

    Returns:
        A dict with ``_id``/``_type``/``_index``/``_source`` keys, the shape
        this script hands to ``helpers.bulk``.
    """
    stats = t.stats
    return {
        "_id": t.id,
        "_type": doc_type,
        "_index": index,
        "_source": {
            # The id is also indexed as a number so results can be ordered
            # by it. This seems equivalent to ordering by created_time.
            "id": t.id,
            "display_name": t.display_name,
            "created_time": t.created_time,
            "updated_time": t.updated_time,
            "description": t.description,
            # Not analyzed, but included so magnet links can be rendered
            # without querying sql again.
            "info_hash": t.info_hash.hex(),
            "filesize": t.filesize,
            "uploader_id": t.uploader_id,
            "main_category_id": t.main_category_id,
            "sub_category_id": t.sub_category_id,
            # XXX all the bitflags are numbers, so coerce them to booleans.
            "anonymous": bool(t.anonymous),
            "trusted": bool(t.trusted),
            "remake": bool(t.remake),
            "complete": bool(t.complete),
            # TODO instead of indexing hidden/deleted torrents and filtering
            # them later, they could be removed from es entirely. Probably
            # won't matter for at least a few months.
            "hidden": bool(t.hidden),
            "deleted": bool(t.deleted),
            "has_torrent": t.has_torrent,
            # XXX last_updated isn't initialized; fall back to creation time.
            "stats_last_updated": stats.last_updated or t.created_time,
            "download_count": stats.download_count,
            "leech_count": stats.leech_count,
            "seed_count": stats.seed_count,
        },
    }
def page_query(query, limit=sys.maxsize, batch_size=10000, progress=None):
    """Yield the rows of an sqlalchemy query one batch at a time.

    Works like ``yield_per`` but pages with ``query.slice`` so the eager
    joins the query does against the stats table keep working. Annoying
    that this isn't built in somehow.

    Args:
        query: object exposing ``slice(start, stop)`` that returns an
            iterable of rows.
        limit: maximum number of rows to yield in total.
        batch_size: number of rows requested per ``slice`` call.
        progress: object with an ``update(value)`` method that is told how
            many rows have been yielded so far. Defaults to the
            module-level ``bar``.

    Yields:
        Each row of the query, in the order the slices return them.
    """
    # Resolved lazily so passing ``progress`` never touches the global.
    tracker = bar if progress is None else progress
    start = 0
    while start < limit:
        stop = min(limit, start + batch_size)
        if stop <= start:
            # A non-positive batch_size would otherwise never advance.
            break
        fetched = 0
        for thing in query.slice(start, stop):
            fetched += 1
            yield thing
        if fetched:
            # Report rows actually yielded, not the offset of the batch
            # that was just finished.
            # NOTE(review): if rows were added after the bar's total was
            # counted this can exceed that total; confirm how the bar
            # reacts to an out-of-range value.
            tracker.update(start + fetched)
        if fetched < stop - start:
            # A short (or empty) page means the query is exhausted, so
            # there is no need to issue another query just to see nothing.
            break
        start = stop
# Page in a stable id order: LIMIT/OFFSET paging over a query without an
# ORDER BY has no guaranteed row order, so successive batches could
# otherwise repeat or skip torrents.
helpers.bulk(
    es,
    (mk_es(t) for t in page_query(Torrent.query.order_by(Torrent.id))),
    chunk_size=10000,
)

View File

@ -27,6 +27,11 @@ from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formatdate
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
es_client = Elasticsearch()
DEBUG_API = False
@ -67,6 +72,16 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
sort_ = sort.lower()
if sort_ not in sort_keys:
flask.abort(400)
# XXX gross why are all the names subtly different
es_sort = ({
'id': 'id',
'size': 'filesize',
'name': 'display_name',
'seeders': 'seed_count',
'leechers': 'leech_count',
'downloads': 'download_count'
})[sort]
sort = sort_keys[sort]
order_keys = {
@ -78,6 +93,10 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
if order_ not in order_keys:
flask.abort(400)
# funky, es sort is default asc, prefixed by '-' if desc
if "desc" == order:
es_sort = "-" + es_sort
filter_keys = {
'0': None,
'1': (models.TorrentFlags.REMAKE, False),
@ -126,28 +145,37 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
if flask.g.user:
same_user = flask.g.user.id == user
s = Search(using=es_client, index='nyaav2')
if term:
query = db.session.query(models.TorrentNameSearch)
s = s.query("query_string", default_field="display_name", default_operator="AND", query=term)
else:
query = models.Torrent.query
# Filter by user
if user:
s = s.filter("term", uploader_id=user)
query = query.filter(models.Torrent.uploader_id == user)
# If admin, show everything
if not admin:
# If user is not logged in or the accessed feed doesn't belong to user,
# hide anonymous torrents belonging to the queried user
if not same_user:
# TODO adapt to es syntax
query = query.filter(models.Torrent.flags.op('&')(
int(models.TorrentFlags.ANONYMOUS | models.TorrentFlags.DELETED)).is_(False))
if main_category:
s = s.filter("term", main_category_id=main_cat_id)
query = query.filter(models.Torrent.main_category_id == main_cat_id)
elif sub_category:
s = s.filter("term", main_category_id=main_cat_id)
s = s.filter("term", sub_category_id=sub_cat_id)
query = query.filter((models.Torrent.main_category_id == main_cat_id) &
(models.Torrent.sub_category_id == sub_cat_id))
# TODO i dunno what this means in es
if filter_tuple:
query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))
@ -157,6 +185,7 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
int(models.TorrentFlags.HIDDEN | models.TorrentFlags.DELETED)).is_(False))
if term:
# note already handled in es
for item in shlex.split(term, posix=False):
if len(item) >= 2:
query = query.filter(FullTextSearch(
@ -166,14 +195,25 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
if sort.class_ != models.Torrent:
query = query.join(sort.class_)
s = s.sort(es_sort)
query = query.order_by(getattr(sort, order)())
per = app.config['RESULTS_PER_PAGE']
if rss:
query = query.limit(app.config['RESULTS_PER_PAGE'])
pass
#query = query.limit(app.config['RESULTS_PER_PAGE'])
else:
query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
# page is 1-based?
s = s[(page-1)*per:page*per]
#query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
return query
s = s.highlight_options(tags_schema='styled')
s = s.highlight("display_name")
#return query
from pprint import pprint
print(json.dumps(s.to_dict()))
return s.execute()
@app.errorhandler(404)
@ -445,6 +485,7 @@ def activate_user(payload):
user.status = models.UserStatusType.ACTIVE
db.session.add(user)
db.session.commit()

View File

@ -97,4 +97,14 @@ table.torrent-list thead th.sorting_desc:after {
margin-left: 20px;
margin-bottom: 10px;
}
}
}
/* elasticsearch term highlight.
   The search code asks for highlighting with tags_schema='styled' and the
   template renders the highlighted display_name unescaped; .hlt1 is
   presumably the class on the tag elasticsearch wraps around matched
   terms (confirm against the elasticsearch highlighting docs). */
.hlt1 {
font-style: normal;
display: inline-block;
padding: 0 3px;
border-radius: 3px;
border: 1px solid rgba(100, 56, 0, 0.8);
background: rgba(200,127,0,0.3);
}

View File

@ -8,7 +8,7 @@
{{ caller() }}
</th>
{% endmacro %}
{% if torrent_query.items %}
{% if torrent_query.hits.total > 0 %}
<div class="table-responsive">
<table class="table table-bordered table-hover table-striped torrent-list">
<thead>
@ -45,26 +45,26 @@
</tr>
</thead>
<tbody>
{% for torrent in torrent_query.items %}
{% for torrent in torrent_query %}
<tr class="{% if torrent.deleted %}deleted{% elif torrent.hidden %}warning{% elif torrent.remake %}danger{% elif torrent.trusted %}success{% else %}default{% endif %}">
{% set cat_id = (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %}
{% set cat_id = (torrent.main_category_id|string) + '_' + (torrent.sub_category_id|string) %}
{% set icon_dir = config.SITE_FLAVOR %}
<td style="padding:0 4px;">
<a href="/?c={{ cat_id }}" title="{{ torrent.main_category.name }} - {{ torrent.sub_category.name }}">
<a href="/?c={{ cat_id }}" title="{{ torrent.main_category_id }} - {{ torrent.sub_category_id }}">
<img src="/static/img/icons/{{ icon_dir }}/{{ cat_id }}.png">
</a>
</td>
<td><a href="{{ url_for('view_torrent', torrent_id=torrent.id) }}">{{ torrent.display_name | escape }}</a></td>
<td><a href="{{ url_for('view_torrent', torrent_id=torrent.meta.id) }}">{{ torrent.meta.highlight.display_name[0] | safe }}</a></td>
<td style="white-space: nowrap;text-align: center;">
{% if torrent.has_torrent %}<a href="{{ url_for('download_torrent', torrent_id=torrent.id) }}"><i class="fa fa-fw fa-download"></i></a>{% endif %}
<a href="{{ torrent.magnet_uri }}"><i class="fa fa-fw fa-magnet"></i></a>
</td>
<td class="text-center">{{ torrent.filesize | filesizeformat(True) }}</td>
<td class="text-center" data-timestamp="{{ torrent.created_utc_timestamp|int }}">{{ torrent.created_time.strftime('%Y-%m-%d %H:%M') }}</td>
<td class="text-center" {#data-timestamp="{{ torrent.created_time|int }}"#}>{{ torrent.created_time }}</td>
{% if config.ENABLE_SHOW_STATS %}
<td class="text-center" style="color: green;">{{ torrent.stats.seed_count }}</td>
<td class="text-center" style="color: red;">{{ torrent.stats.leech_count }}</td>
<td class="text-center">{{ torrent.stats.download_count }}</td>
<td class="text-center" style="color: green;">{{ torrent.seed_count }}</td>
<td class="text-center" style="color: red;">{{ torrent.leech_count }}</td>
<td class="text-center">{{ torrent.download_count }}</td>
{% endif %}
</tr>
{% endfor %}
@ -75,7 +75,9 @@
<h3>No results found</h3>
{% endif %}
{#
<center>
{% from "bootstrap/pagination.html" import render_pagination %}
{{ render_pagination(torrent_query) }}
</center>
#}

View File

@ -24,11 +24,33 @@ pycodestyle==2.3.1
pycparser==2.17
pyparsing==2.2.0
six==1.10.0
SQLAlchemy>=1.1.9
SQLAlchemy==1.1.9
SQLAlchemy-FullText-Search==0.2.3
SQLAlchemy-Utils>=0.32.14
SQLAlchemy-Utils==0.32.14
uWSGI==2.0.15
visitor==0.1.3
webassets==0.12.1
Werkzeug==0.12.1
WTForms==2.1
## The following requirements were added by pip freeze:
decorator==4.0.11
elasticsearch==5.3.0
elasticsearch-dsl==5.2.0
ipython==6.0.0
ipython-genutils==0.2.0
jedi==0.10.2
mysql-replication==0.13
pexpect==4.2.1
pickleshare==0.7.4
pkg-resources==0.0.0
progressbar2==3.20.0
prompt-toolkit==1.0.14
ptyprocess==0.5.1
Pygments==2.2.0
PyMySQL==0.7.11
python-dateutil==2.6.0
python-utils==2.1.0
simplegeneric==0.8.1
traitlets==4.3.2
urllib3==1.21.1
wcwidth==0.1.7