mirror of
https://gitlab.com/SIGBUS/nyaa.git
synced 2024-12-22 11:10:00 +00:00
Merge pull request #50 from qqueue/elasticsearchin
elasticsearch-based search (WIP)
This commit is contained in:
commit
8bca32a626
94
import_to_es.py
Normal file
94
import_to_es.py
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Bulk load torrents from mysql into elasticsearch `nyaav2` index,
|
||||||
|
which is assumed to already exist.
|
||||||
|
This is a one-shot deal, so you'd either need to complement it
|
||||||
|
with a cron job or some binlog-reading thing (TODO)
|
||||||
|
"""
|
||||||
|
from nyaa.models import Torrent
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
from elasticsearch import helpers
|
||||||
|
import progressbar
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Total number of torrents, used as the upper bound of the progress bar.
_torrent_total = Torrent.query.count()

# Rendered as: "<done> of <total> [<elapsed>] <bar> (<eta>)"
_bar_widgets = [
    progressbar.SimpleProgress(),
    ' [', progressbar.Timer(), '] ',
    progressbar.Bar(),
    ' (', progressbar.ETA(), ') ',
]

bar = progressbar.ProgressBar(max_value=_torrent_total, widgets=_bar_widgets)

# Elasticsearch client built with the library's default arguments.
es = Elasticsearch()
|
||||||
|
|
||||||
|
# turn into thing that elasticsearch indexes. We flatten in
|
||||||
|
# the stats (seeders/leechers) so we can order by them in es naturally.
|
||||||
|
# we _don't_ dereference uploader_id to the user's display name however,
|
||||||
|
# instead doing that at query time. I _think_ this is right because
|
||||||
|
# we don't want to reindex all the user's torrents just because they
|
||||||
|
# changed their name, and we don't really want to FTS search on the user anyway.
|
||||||
|
# Maybe it's more convenient to dereference though.
|
||||||
|
def mk_es(t):
    """Build the Elasticsearch bulk action for a single torrent ``t``.

    The result carries the bulk metadata (``_id``, ``_type``, ``_index``)
    and a ``_source`` document made of the torrent's own attributes, its
    flags coerced to booleans, and the counters read from ``t.stats``.
    """
    document = {
        # the id is also indexed as a number inside the document so
        # results can be ordered by it (seems equivalent to ordering
        # by created_time, but oh well)
        "id": t.id,
        "display_name": t.display_name,
        "created_time": t.created_time,
        "updated_time": t.updated_time,
        "description": t.description,
        # not analyzed but included so magnet links can be rendered
        # without querying sql again.
        "info_hash": t.info_hash.hex(),
        "filesize": t.filesize,
        "uploader_id": t.uploader_id,
        "main_category_id": t.main_category_id,
        "sub_category_id": t.sub_category_id,
    }

    # XXX all the bitflags are numbers, so coerce each one to a bool.
    # TODO hidden/deleted are indexed and filtered later; they could be
    # deleted from es entirely instead. Probably won't matter for at
    # least a few months.
    flag_names = ("anonymous", "trusted", "remake", "complete", "hidden", "deleted")
    for flag_name in flag_names:
        document[flag_name] = bool(getattr(t, flag_name))

    document["has_torrent"] = t.has_torrent

    stats = t.stats
    # XXX last_updated isn't initialized, so fall back to the creation time
    document["stats_last_updated"] = stats.last_updated or t.created_time
    document["download_count"] = stats.download_count
    document["leech_count"] = stats.leech_count
    document["seed_count"] = stats.seed_count

    return {
        "_id": t.id,
        "_type": "torrent",
        "_index": "nyaav2",
        "_source": document,
    }
|
||||||
|
|
||||||
|
# page through an sqlalchemy query, like the per_fetch but
|
||||||
|
# doesn't break the eager joins it's doing against the stats table.
|
||||||
|
# annoying that this isn't built in somehow.
|
||||||
|
def page_query(query, limit=sys.maxsize, batch_size=10000):
    """Yield the rows of ``query`` by fetching them one slice at a time.

    Each batch is fetched with ``query.slice(start, stop)``. Iteration ends
    when a batch comes back empty or when ``limit`` rows have been requested.

    :param query: object exposing ``slice(start, stop)`` that returns an
        iterable of rows (used here with an sqlalchemy query).
    :param limit: upper bound on the row offset to read up to.
    :param batch_size: maximum number of rows requested per slice.
    :return: generator over the rows, in the order the slices return them.

    After every batch that is not the last one, the module-level progress
    bar ``bar`` is advanced to the number of rows yielded so far.

    NOTE(review): consecutive slices only line up if ``query`` has a
    deterministic ordering -- confirm the caller applies one.
    """
    start = 0
    while start < limit:
        stop = min(limit, start + batch_size)
        if stop <= start:
            # a non-positive batch_size can never make progress
            break

        # A query object is truthy whether or not it matches rows, so the
        # only reliable emptiness test is to count what the slice yields.
        fetched = 0
        for thing in query.slice(start, stop):
            fetched += 1
            yield thing

        if fetched == 0 or stop == limit:
            break

        # report rows actually emitted, not the offset of the batch that
        # just finished (which lagged one full batch behind)
        bar.update(start + fetched)
        start = stop
|
||||||
|
|
||||||
|
# Page through the torrents in id order: LIMIT/OFFSET slices taken from a
# query without an ORDER BY have no guaranteed row order, so successive
# batches could repeat some torrents and skip others.
helpers.bulk(
    es,
    (mk_es(t) for t in page_query(Torrent.query.order_by(Torrent.id))),
    chunk_size=10000,
)
|
|
@ -27,6 +27,11 @@ from email.mime.multipart import MIMEMultipart
|
||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
from email.utils import formatdate
|
from email.utils import formatdate
|
||||||
|
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
from elasticsearch_dsl import Search, Q
|
||||||
|
|
||||||
|
es_client = Elasticsearch()
|
||||||
|
|
||||||
DEBUG_API = False
|
DEBUG_API = False
|
||||||
|
|
||||||
|
|
||||||
|
@ -67,6 +72,16 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
||||||
sort_ = sort.lower()
|
sort_ = sort.lower()
|
||||||
if sort_ not in sort_keys:
|
if sort_ not in sort_keys:
|
||||||
flask.abort(400)
|
flask.abort(400)
|
||||||
|
|
||||||
|
# XXX gross why are all the names subtly different
|
||||||
|
es_sort = ({
|
||||||
|
'id': 'id',
|
||||||
|
'size': 'filesize',
|
||||||
|
'name': 'display_name',
|
||||||
|
'seeders': 'seed_count',
|
||||||
|
'leechers': 'leech_count',
|
||||||
|
'downloads': 'download_count'
|
||||||
|
})[sort]
|
||||||
sort = sort_keys[sort]
|
sort = sort_keys[sort]
|
||||||
|
|
||||||
order_keys = {
|
order_keys = {
|
||||||
|
@ -78,6 +93,10 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
||||||
if order_ not in order_keys:
|
if order_ not in order_keys:
|
||||||
flask.abort(400)
|
flask.abort(400)
|
||||||
|
|
||||||
|
# funky, es sort is default asc, prefixed by '-' if desc
|
||||||
|
if "desc" == order:
|
||||||
|
es_sort = "-" + es_sort
|
||||||
|
|
||||||
filter_keys = {
|
filter_keys = {
|
||||||
'0': None,
|
'0': None,
|
||||||
'1': (models.TorrentFlags.REMAKE, False),
|
'1': (models.TorrentFlags.REMAKE, False),
|
||||||
|
@ -126,28 +145,37 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
||||||
if flask.g.user:
|
if flask.g.user:
|
||||||
same_user = flask.g.user.id == user
|
same_user = flask.g.user.id == user
|
||||||
|
|
||||||
|
s = Search(using=es_client, index='nyaav2')
|
||||||
if term:
|
if term:
|
||||||
query = db.session.query(models.TorrentNameSearch)
|
query = db.session.query(models.TorrentNameSearch)
|
||||||
|
s = s.query("query_string", default_field="display_name", default_operator="AND", query=term)
|
||||||
else:
|
else:
|
||||||
query = models.Torrent.query
|
query = models.Torrent.query
|
||||||
|
|
||||||
# Filter by user
|
# Filter by user
|
||||||
if user:
|
if user:
|
||||||
|
s = s.filter("term", uploader_id=user)
|
||||||
|
|
||||||
query = query.filter(models.Torrent.uploader_id == user)
|
query = query.filter(models.Torrent.uploader_id == user)
|
||||||
# If admin, show everything
|
# If admin, show everything
|
||||||
if not admin:
|
if not admin:
|
||||||
# If user is not logged in or the accessed feed doesn't belong to user,
|
# If user is not logged in or the accessed feed doesn't belong to user,
|
||||||
# hide anonymous torrents belonging to the queried user
|
# hide anonymous torrents belonging to the queried user
|
||||||
if not same_user:
|
if not same_user:
|
||||||
|
# TODO adapt to es syntax
|
||||||
query = query.filter(models.Torrent.flags.op('&')(
|
query = query.filter(models.Torrent.flags.op('&')(
|
||||||
int(models.TorrentFlags.ANONYMOUS | models.TorrentFlags.DELETED)).is_(False))
|
int(models.TorrentFlags.ANONYMOUS | models.TorrentFlags.DELETED)).is_(False))
|
||||||
|
|
||||||
if main_category:
|
if main_category:
|
||||||
|
s = s.filter("term", main_category_id=main_cat_id)
|
||||||
query = query.filter(models.Torrent.main_category_id == main_cat_id)
|
query = query.filter(models.Torrent.main_category_id == main_cat_id)
|
||||||
elif sub_category:
|
elif sub_category:
|
||||||
|
s = s.filter("term", main_category_id=main_cat_id)
|
||||||
|
s = s.filter("term", sub_category_id=sub_cat_id)
|
||||||
query = query.filter((models.Torrent.main_category_id == main_cat_id) &
|
query = query.filter((models.Torrent.main_category_id == main_cat_id) &
|
||||||
(models.Torrent.sub_category_id == sub_cat_id))
|
(models.Torrent.sub_category_id == sub_cat_id))
|
||||||
|
|
||||||
|
# TODO i dunno what this means in es
|
||||||
if filter_tuple:
|
if filter_tuple:
|
||||||
query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))
|
query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))
|
||||||
|
|
||||||
|
@ -157,6 +185,7 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
||||||
int(models.TorrentFlags.HIDDEN | models.TorrentFlags.DELETED)).is_(False))
|
int(models.TorrentFlags.HIDDEN | models.TorrentFlags.DELETED)).is_(False))
|
||||||
|
|
||||||
if term:
|
if term:
|
||||||
|
# note already handled in es
|
||||||
for item in shlex.split(term, posix=False):
|
for item in shlex.split(term, posix=False):
|
||||||
if len(item) >= 2:
|
if len(item) >= 2:
|
||||||
query = query.filter(FullTextSearch(
|
query = query.filter(FullTextSearch(
|
||||||
|
@ -166,14 +195,25 @@ def search(term='', user=None, sort='id', order='desc', category='0_0', quality_
|
||||||
if sort.class_ != models.Torrent:
|
if sort.class_ != models.Torrent:
|
||||||
query = query.join(sort.class_)
|
query = query.join(sort.class_)
|
||||||
|
|
||||||
|
s = s.sort(es_sort)
|
||||||
query = query.order_by(getattr(sort, order)())
|
query = query.order_by(getattr(sort, order)())
|
||||||
|
|
||||||
|
per = app.config['RESULTS_PER_PAGE']
|
||||||
if rss:
|
if rss:
|
||||||
query = query.limit(app.config['RESULTS_PER_PAGE'])
|
pass
|
||||||
|
#query = query.limit(app.config['RESULTS_PER_PAGE'])
|
||||||
else:
|
else:
|
||||||
query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
|
# page is 1-based?
|
||||||
|
s = s[(page-1)*per:page*per]
|
||||||
|
#query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)
|
||||||
|
|
||||||
return query
|
s = s.highlight_options(tags_schema='styled')
|
||||||
|
s = s.highlight("display_name")
|
||||||
|
|
||||||
|
#return query
|
||||||
|
from pprint import pprint
|
||||||
|
print(json.dumps(s.to_dict()))
|
||||||
|
return s.execute()
|
||||||
|
|
||||||
|
|
||||||
@app.errorhandler(404)
|
@app.errorhandler(404)
|
||||||
|
@ -445,6 +485,7 @@ def activate_user(payload):
|
||||||
|
|
||||||
user.status = models.UserStatusType.ACTIVE
|
user.status = models.UserStatusType.ACTIVE
|
||||||
|
|
||||||
|
|
||||||
db.session.add(user)
|
db.session.add(user)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
|
||||||
|
|
|
@ -97,4 +97,14 @@ table.torrent-list thead th.sorting_desc:after {
|
||||||
margin-left: 20px;
|
margin-left: 20px;
|
||||||
margin-bottom: 10px;
|
margin-bottom: 10px;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* elasticsearch term highlight */
|
||||||
|
.hlt1 {
|
||||||
|
font-style: normal;
|
||||||
|
display: inline-block;
|
||||||
|
padding: 0 3px;
|
||||||
|
border-radius: 3px;
|
||||||
|
border: 1px solid rgba(100, 56, 0, 0.8);
|
||||||
|
background: rgba(200,127,0,0.3);
|
||||||
|
}
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
{{ caller() }}
|
{{ caller() }}
|
||||||
</th>
|
</th>
|
||||||
{% endmacro %}
|
{% endmacro %}
|
||||||
{% if torrent_query.items %}
|
{% if torrent_query.hits.total > 0 %}
|
||||||
<div class="table-responsive">
|
<div class="table-responsive">
|
||||||
<table class="table table-bordered table-hover table-striped torrent-list">
|
<table class="table table-bordered table-hover table-striped torrent-list">
|
||||||
<thead>
|
<thead>
|
||||||
|
@ -45,26 +45,26 @@
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for torrent in torrent_query.items %}
|
{% for torrent in torrent_query %}
|
||||||
<tr class="{% if torrent.deleted %}deleted{% elif torrent.hidden %}warning{% elif torrent.remake %}danger{% elif torrent.trusted %}success{% else %}default{% endif %}">
|
<tr class="{% if torrent.deleted %}deleted{% elif torrent.hidden %}warning{% elif torrent.remake %}danger{% elif torrent.trusted %}success{% else %}default{% endif %}">
|
||||||
{% set cat_id = (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %}
|
{% set cat_id = (torrent.main_category_id|string) + '_' + (torrent.sub_category_id|string) %}
|
||||||
{% set icon_dir = config.SITE_FLAVOR %}
|
{% set icon_dir = config.SITE_FLAVOR %}
|
||||||
<td style="padding:0 4px;">
|
<td style="padding:0 4px;">
|
||||||
<a href="/?c={{ cat_id }}" title="{{ torrent.main_category.name }} - {{ torrent.sub_category.name }}">
|
<a href="/?c={{ cat_id }}" title="{{ torrent.main_category_id }} - {{ torrent.sub_category_id }}">
|
||||||
<img src="/static/img/icons/{{ icon_dir }}/{{ cat_id }}.png">
|
<img src="/static/img/icons/{{ icon_dir }}/{{ cat_id }}.png">
|
||||||
</a>
|
</a>
|
||||||
</td>
|
</td>
|
||||||
<td><a href="{{ url_for('view_torrent', torrent_id=torrent.id) }}">{{ torrent.display_name | escape }}</a></td>
|
<td><a href="{{ url_for('view_torrent', torrent_id=torrent.meta.id) }}">{{ torrent.meta.highlight.display_name[0] | safe }}</a></td>
|
||||||
<td style="white-space: nowrap;text-align: center;">
|
<td style="white-space: nowrap;text-align: center;">
|
||||||
{% if torrent.has_torrent %}<a href="{{ url_for('download_torrent', torrent_id=torrent.id) }}"><i class="fa fa-fw fa-download"></i></a>{% endif %}
|
{% if torrent.has_torrent %}<a href="{{ url_for('download_torrent', torrent_id=torrent.id) }}"><i class="fa fa-fw fa-download"></i></a>{% endif %}
|
||||||
<a href="{{ torrent.magnet_uri }}"><i class="fa fa-fw fa-magnet"></i></a>
|
<a href="{{ torrent.magnet_uri }}"><i class="fa fa-fw fa-magnet"></i></a>
|
||||||
</td>
|
</td>
|
||||||
<td class="text-center">{{ torrent.filesize | filesizeformat(True) }}</td>
|
<td class="text-center">{{ torrent.filesize | filesizeformat(True) }}</td>
|
||||||
<td class="text-center" data-timestamp="{{ torrent.created_utc_timestamp|int }}">{{ torrent.created_time.strftime('%Y-%m-%d %H:%M') }}</td>
|
<td class="text-center" {#data-timestamp="{{ torrent.created_time|int }}"#}>{{ torrent.created_time }}</td>
|
||||||
{% if config.ENABLE_SHOW_STATS %}
|
{% if config.ENABLE_SHOW_STATS %}
|
||||||
<td class="text-center" style="color: green;">{{ torrent.stats.seed_count }}</td>
|
<td class="text-center" style="color: green;">{{ torrent.seed_count }}</td>
|
||||||
<td class="text-center" style="color: red;">{{ torrent.stats.leech_count }}</td>
|
<td class="text-center" style="color: red;">{{ torrent.leech_count }}</td>
|
||||||
<td class="text-center">{{ torrent.stats.download_count }}</td>
|
<td class="text-center">{{ torrent.download_count }}</td>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
@ -75,7 +75,9 @@
|
||||||
<h3>No results found</h3>
|
<h3>No results found</h3>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
{#
|
||||||
<center>
|
<center>
|
||||||
{% from "bootstrap/pagination.html" import render_pagination %}
|
{% from "bootstrap/pagination.html" import render_pagination %}
|
||||||
{{ render_pagination(torrent_query) }}
|
{{ render_pagination(torrent_query) }}
|
||||||
</center>
|
</center>
|
||||||
|
#}
|
||||||
|
|
|
@ -24,11 +24,33 @@ pycodestyle==2.3.1
|
||||||
pycparser==2.17
|
pycparser==2.17
|
||||||
pyparsing==2.2.0
|
pyparsing==2.2.0
|
||||||
six==1.10.0
|
six==1.10.0
|
||||||
SQLAlchemy>=1.1.9
|
SQLAlchemy==1.1.9
|
||||||
SQLAlchemy-FullText-Search==0.2.3
|
SQLAlchemy-FullText-Search==0.2.3
|
||||||
SQLAlchemy-Utils>=0.32.14
|
SQLAlchemy-Utils==0.32.14
|
||||||
uWSGI==2.0.15
|
uWSGI==2.0.15
|
||||||
visitor==0.1.3
|
visitor==0.1.3
|
||||||
webassets==0.12.1
|
webassets==0.12.1
|
||||||
Werkzeug==0.12.1
|
Werkzeug==0.12.1
|
||||||
WTForms==2.1
|
WTForms==2.1
|
||||||
|
## The following requirements were added by pip freeze:
|
||||||
|
decorator==4.0.11
|
||||||
|
elasticsearch==5.3.0
|
||||||
|
elasticsearch-dsl==5.2.0
|
||||||
|
ipython==6.0.0
|
||||||
|
ipython-genutils==0.2.0
|
||||||
|
jedi==0.10.2
|
||||||
|
mysql-replication==0.13
|
||||||
|
pexpect==4.2.1
|
||||||
|
pickleshare==0.7.4
|
||||||
|
pkg-resources==0.0.0
|
||||||
|
progressbar2==3.20.0
|
||||||
|
prompt-toolkit==1.0.14
|
||||||
|
ptyprocess==0.5.1
|
||||||
|
Pygments==2.2.0
|
||||||
|
PyMySQL==0.7.11
|
||||||
|
python-dateutil==2.6.0
|
||||||
|
python-utils==2.1.0
|
||||||
|
simplegeneric==0.8.1
|
||||||
|
traitlets==4.3.2
|
||||||
|
urllib3==1.21.1
|
||||||
|
wcwidth==0.1.7
|
||||||
|
|
Loading…
Reference in a new issue