mirror of
https://gitlab.com/SIGBUS/nyaa.git
synced 2024-12-22 10:59:59 +00:00
Merge pull request #50 from qqueue/elasticsearchin
elasticsearch-based search (WIP)
This commit is contained in:
commit
8bca32a626
94
import_to_es.py
Normal file
94
import_to_es.py
Normal file
|
@ -0,0 +1,94 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Bulk load torrents from mysql into elasticsearch `nyaav2` index,
|
||||
which is assumed to already exist.
|
||||
This is a one-shot deal, so you'd either need to complement it
|
||||
with a cron job or some binlog-reading thing (TODO)
|
||||
"""
|
||||
from nyaa.models import Torrent
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch import helpers
|
||||
import progressbar
|
||||
import sys
|
||||
|
||||
# Layout of the progress line: "<done> of <total> [<elapsed>] <bar> (<eta>) ".
_bar_widgets = [
    progressbar.SimpleProgress(),
    ' [',
    progressbar.Timer(),
    '] ',
    progressbar.Bar(),
    ' (',
    progressbar.ETA(),
    ') ',
]

# The maximum is the torrent row count at the moment the script starts.
bar = progressbar.ProgressBar(
    max_value=Torrent.query.count(),
    widgets=_bar_widgets,
)
|
||||
|
||||
# Shared elasticsearch client used by the bulk load below. No hosts or
# options are passed, so the client library's defaults apply.
es = Elasticsearch()
|
||||
|
||||
# turn into thing that elasticsearch indexes. We flatten in
|
||||
# the stats (seeders/leechers) so we can order by them in es naturally.
|
||||
# we _don't_ dereference uploader_id to the user's display name however,
|
||||
# instead doing that at query time. I _think_ this is right because
|
||||
# we don't want to reindex all the user's torrents just because they
|
||||
# changed their name, and we don't really want to FTS search on the user anyway.
|
||||
# Maybe it's more convenient to dereference though.
|
||||
def mk_es(t):
    """Build the elasticsearch bulk action for a single torrent.

    The returned dict carries the bulk metadata (``_id``, ``_type``,
    ``_index``) plus a ``_source`` document in which the torrent's stats
    (seed/leech/download counts) are flattened next to its own columns.

    :param t: torrent object; its column attributes and ``t.stats`` are read.
    :returns: dict suitable as one action for ``elasticsearch.helpers.bulk``.
    """
    stats = t.stats

    document = {
        # the id is also indexed as a number so results can be ordered by
        # it. seems like this is just equivalent to ordering by
        # created_time, but oh well
        "id": t.id,
        "display_name": t.display_name,
        "created_time": t.created_time,
        "updated_time": t.updated_time,
        "description": t.description,
        # not analyzed but included so we can render magnet links
        # without querying sql again.
        "info_hash": t.info_hash.hex(),
        "filesize": t.filesize,
        "uploader_id": t.uploader_id,
        "main_category_id": t.main_category_id,
        "sub_category_id": t.sub_category_id,
    }

    # XXX all the bitflags are numbers, so coerce each one to a real bool.
    # TODO instead of indexing hidden/deleted and filtering later we
    # could delete from es entirely. Probably won't matter for at least
    # a few months.
    for flag in ("anonymous", "trusted", "remake", "complete",
                 "hidden", "deleted"):
        document[flag] = bool(getattr(t, flag))

    document["has_torrent"] = t.has_torrent

    # XXX last_updated isn't initialized, so fall back to the creation time.
    document["stats_last_updated"] = stats.last_updated or t.created_time
    document["download_count"] = stats.download_count
    document["leech_count"] = stats.leech_count
    document["seed_count"] = stats.seed_count

    return {
        "_id": t.id,
        "_type": "torrent",
        "_index": "nyaav2",
        "_source": document,
    }
|
||||
|
||||
# page through an sqlalchemy query, like the per_fetch but
|
||||
# doesn't break the eager joins it's doing against the stats table.
|
||||
# annoying that this isn't built in somehow.
|
||||
def page_query(query, limit=sys.maxsize, batch_size=10000):
    """Yield the rows of ``query`` one at a time, fetched in batches.

    Rows are requested with ``query.slice(start, stop)`` in windows of at
    most ``batch_size`` rows, so only one batch is asked for at a time.
    After every full batch that is not the last one, the module-level
    progress bar ``bar`` is updated with the offset of that batch.

    :param query: object exposing ``slice(start, stop)`` that returns an
        iterable of rows (presumably an SQLAlchemy query -- confirm at
        the call site).
    :param limit: maximum total number of rows to yield.
    :param batch_size: number of rows requested per ``slice`` call.
    """
    start = 0
    while True:
        stop = min(limit, start + batch_size)
        if stop == start:
            # Empty window: the limit has been reached (or batch_size is 0).
            break

        fetched = 0
        for row in query.slice(start, stop):
            fetched += 1
            yield row

        # A batch shorter than requested means there is nothing left to
        # fetch, so stop here instead of issuing one more slice at a deep
        # offset only to get zero rows back. Also stop once the caller's
        # limit has been served.
        if fetched < stop - start or stop == limit:
            break

        bar.update(start)
        start = stop
|
||||
|
||||
# Lazily turn every torrent row into a bulk action and stream the actions
# to elasticsearch, 10000 per bulk request.
_actions = (mk_es(torrent) for torrent in page_query(Torrent.query))
helpers.bulk(es, _actions, chunk_size=10000)
|
|
@ -27,6 +27,11 @@ from email.mime.multipart import MIMEMultipart
|
|||
from email.mime.text import MIMEText
|
||||
from email.utils import formatdate
|
||||
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch_dsl import Search, Q
|
||||
|
||||
es_client = Elasticsearch()
|
||||
|
||||
DEBUG_API = False
|
||||
|
||||
|
||||
|
@ -67,6 +72,16 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
|||
sort_ = sort.lower()
|
||||
if sort_ not in sort_keys:
|
||||
flask.abort(400)
|
||||
|
||||
# XXX gross why are all the names subtly different
|
||||
es_sort = ({
|
||||
'id': 'id',
|
||||
'size': 'filesize',
|
||||
'name': 'display_name',
|
||||
'seeders': 'seed_count',
|
||||
'leechers': 'leech_count',
|
||||
'downloads': 'download_count'
|
||||
})[sort]
|
||||
sort = sort_keys[sort]
|
||||
|
||||
order_keys = {
|
||||
|
@ -78,6 +93,10 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
|||
if order_ not in order_keys:
|
||||
flask.abort(400)
|
||||
|
||||
# funky, es sort is default asc, prefixed by '-' if desc
|
||||
if "desc" == order:
|
||||
es_sort = "-" + es_sort
|
||||
|
||||
filter_keys = {
|
||||
'0': None,
|
||||
'1': (models.TorrentFlags.REMAKE, False),
|
||||
|
@ -126,28 +145,37 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
|||
if flask.g.user:
|
||||
same_user = flask.g.user.id == user
|
||||
|
||||
s = Search(using=es_client, index='nyaav2')
|
||||
if term:
|
||||
query = db.session.query(models.TorrentNameSearch)
|
||||
s = s.query("query_string", default_field="display_name", default_operator="AND", query=term)
|
||||
else:
|
||||
query = models.Torrent.query
|
||||
|
||||
# Filter by user
|
||||
if user:
|
||||
s = s.filter("term", uploader_id=user)
|
||||
|
||||
query = query.filter(models.Torrent.uploader_id == user)
|
||||
# If admin, show everything
|
||||
if not admin:
|
||||
# If user is not logged in or the accessed feed doesn't belong to user,
|
||||
# hide anonymous torrents belonging to the queried user
|
||||
if not same_user:
|
||||
# TODO adapt to es syntax
|
||||
query = query.filter(models.Torrent.flags.op('&')(
|
||||
int(models.TorrentFlags.ANONYMOUS | models.TorrentFlags.DELETED)).is_(False))
|
||||
|
||||
if main_category:
|
||||
s = s.filter("term", main_category_id=main_cat_id)
|
||||
query = query.filter(models.Torrent.main_category_id == main_cat_id)
|
||||
elif sub_category:
|
||||
s = s.filter("term", main_category_id=main_cat_id)
|
||||
s = s.filter("term", sub_category_id=sub_cat_id)
|
||||
query = query.filter((models.Torrent.main_category_id == main_cat_id) &
|
||||
(models.Torrent.sub_category_id == sub_cat_id))
|
||||
|
||||
# TODO i dunno what this means in es
|
||||
if filter_tuple:
|
||||
query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))
|
||||
|
||||
|
@ -157,6 +185,7 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
|||
int(models.TorrentFlags.HIDDEN | models.TorrentFlags.DELETED)).is_(False))
|
||||
|
||||
if term:
|
||||
# note already handled in es
|
||||
for item in shlex.split(term, posix=False):
|
||||
if len(item) >= 2:
|
||||
query = query.filter(FullTextSearch(
|
||||
|
@ -166,14 +195,25 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
|||
if sort.class_ != models.Torrent:
|
||||
query = query.join(sort.class_)
|
||||
|
||||
s = s.sort(es_sort)
|
||||
query = query.order_by(getattr(sort, order)())
|
||||
|
||||
per = app.config['RESULTS_PER_PAGE']
|
||||
if rss:
|
||||
query = query.limit(app.config['RESULTS_PER_PAGE'])
|
||||
pass
|
||||
#query = query.limit(app.config['RESULTS_PER_PAGE'])
|
||||
else:
|
||||
query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
|
||||
# page is 1-based?
|
||||
s = s[(page-1)*per:page*per]
|
||||
#query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
|
||||
|
||||
return query
|
||||
s = s.highlight_options(tags_schema='styled')
|
||||
s = s.highlight("display_name")
|
||||
|
||||
#return query
|
||||
from pprint import pprint
|
||||
print(json.dumps(s.to_dict()))
|
||||
return s.execute()
|
||||
|
||||
|
||||
@app.errorhandler(404)
|
||||
|
@ -445,6 +485,7 @@ def activate_user(payload):
|
|||
|
||||
user.status = models.UserStatusType.ACTIVE
|
||||
|
||||
|
||||
db.session.add(user)
|
||||
db.session.commit()
|
||||
|
||||
|
|
|
@ -97,4 +97,14 @@ table.torrent-list thead th.sorting_desc:after {
|
|||
margin-left: 20px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* elasticsearch term highlight */
|
||||
.hlt1 {
|
||||
font-style: normal;
|
||||
display: inline-block;
|
||||
padding: 0 3px;
|
||||
border-radius: 3px;
|
||||
border: 1px solid rgba(100, 56, 0, 0.8);
|
||||
background: rgba(200,127,0,0.3);
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
{{ caller() }}
|
||||
</th>
|
||||
{% endmacro %}
|
||||
{% if torrent_query.items %}
|
||||
{% if torrent_query.hits.total > 0 %}
|
||||
<div class="table-responsive">
|
||||
<table class="table table-bordered table-hover table-striped torrent-list">
|
||||
<thead>
|
||||
|
@ -45,26 +45,26 @@
|
|||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for torrent in torrent_query.items %}
|
||||
{% for torrent in torrent_query %}
|
||||
<tr class="{% if torrent.deleted %}deleted{% elif torrent.hidden %}warning{% elif torrent.remake %}danger{% elif torrent.trusted %}success{% else %}default{% endif %}">
|
||||
{% set cat_id = (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %}
|
||||
{% set cat_id = (torrent.main_category_id|string) + '_' + (torrent.sub_category_id|string) %}
|
||||
{% set icon_dir = config.SITE_FLAVOR %}
|
||||
<td style="padding:0 4px;">
|
||||
<a href="/?c={{ cat_id }}" title="{{ torrent.main_category.name }} - {{ torrent.sub_category.name }}">
|
||||
<a href="/?c={{ cat_id }}" title="{{ torrent.main_category_id }} - {{ torrent.sub_category_id }}">
|
||||
<img src="/static/img/icons/{{ icon_dir }}/{{ cat_id }}.png">
|
||||
</a>
|
||||
</td>
|
||||
<td><a href="{{ url_for('view_torrent', torrent_id=torrent.id) }}">{{ torrent.display_name | escape }}</a></td>
|
||||
<td><a href="{{ url_for('view_torrent', torrent_id=torrent.meta.id) }}">{{ torrent.meta.highlight.display_name[0] | safe }}</a></td>
|
||||
<td style="white-space: nowrap;text-align: center;">
|
||||
{% if torrent.has_torrent %}<a href="{{ url_for('download_torrent', torrent_id=torrent.id) }}"><i class="fa fa-fw fa-download"></i></a>{% endif %}
|
||||
<a href="{{ torrent.magnet_uri }}"><i class="fa fa-fw fa-magnet"></i></a>
|
||||
</td>
|
||||
<td class="text-center">{{ torrent.filesize | filesizeformat(True) }}</td>
|
||||
<td class="text-center" data-timestamp="{{ torrent.created_utc_timestamp|int }}">{{ torrent.created_time.strftime('%Y-%m-%d %H:%M') }}</td>
|
||||
<td class="text-center" {#data-timestamp="{{ torrent.created_time|int }}"#}>{{ torrent.created_time }}</td>
|
||||
{% if config.ENABLE_SHOW_STATS %}
|
||||
<td class="text-center" style="color: green;">{{ torrent.stats.seed_count }}</td>
|
||||
<td class="text-center" style="color: red;">{{ torrent.stats.leech_count }}</td>
|
||||
<td class="text-center">{{ torrent.stats.download_count }}</td>
|
||||
<td class="text-center" style="color: green;">{{ torrent.seed_count }}</td>
|
||||
<td class="text-center" style="color: red;">{{ torrent.leech_count }}</td>
|
||||
<td class="text-center">{{ torrent.download_count }}</td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
|
@ -75,7 +75,9 @@
|
|||
<h3>No results found</h3>
|
||||
{% endif %}
|
||||
|
||||
{#
|
||||
<center>
|
||||
{% from "bootstrap/pagination.html" import render_pagination %}
|
||||
{{ render_pagination(torrent_query) }}
|
||||
</center>
|
||||
#}
|
||||
|
|
|
@ -24,11 +24,33 @@ pycodestyle==2.3.1
|
|||
pycparser==2.17
|
||||
pyparsing==2.2.0
|
||||
six==1.10.0
|
||||
SQLAlchemy>=1.1.9
|
||||
SQLAlchemy==1.1.9
|
||||
SQLAlchemy-FullText-Search==0.2.3
|
||||
SQLAlchemy-Utils>=0.32.14
|
||||
SQLAlchemy-Utils==0.32.14
|
||||
uWSGI==2.0.15
|
||||
visitor==0.1.3
|
||||
webassets==0.12.1
|
||||
Werkzeug==0.12.1
|
||||
WTForms==2.1
|
||||
## The following requirements were added by pip freeze:
|
||||
decorator==4.0.11
|
||||
elasticsearch==5.3.0
|
||||
elasticsearch-dsl==5.2.0
|
||||
ipython==6.0.0
|
||||
ipython-genutils==0.2.0
|
||||
jedi==0.10.2
|
||||
mysql-replication==0.13
|
||||
pexpect==4.2.1
|
||||
pickleshare==0.7.4
|
||||
pkg-resources==0.0.0
|
||||
progressbar2==3.20.0
|
||||
prompt-toolkit==1.0.14
|
||||
ptyprocess==0.5.1
|
||||
Pygments==2.2.0
|
||||
PyMySQL==0.7.11
|
||||
python-dateutil==2.6.0
|
||||
python-utils==2.1.0
|
||||
simplegeneric==0.8.1
|
||||
traitlets==4.3.2
|
||||
urllib3==1.21.1
|
||||
wcwidth==0.1.7
|
||||
|
|
Loading…
Reference in a new issue