2017-05-14 06:48:17 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
"""
|
|
|
|
Bulk load torrents from mysql into the elasticsearch `nyaa` and `sukebei`
indexes, which are assumed to already exist.
|
|
|
|
This is a one-shot deal, so you'd either need to complement it
|
|
|
|
with a cron job or some binlog-reading thing (TODO)
|
|
|
|
"""
|
2017-05-27 23:12:48 +00:00
|
|
|
import sys
|
|
|
|
import json
|
|
|
|
|
2017-07-30 17:35:16 +00:00
|
|
|
# This should be progressbar33
|
|
|
|
import progressbar
|
2017-05-14 06:48:17 +00:00
|
|
|
from elasticsearch import Elasticsearch
|
2017-05-17 05:00:58 +00:00
|
|
|
from elasticsearch.client import IndicesClient
|
2017-05-14 06:48:17 +00:00
|
|
|
from elasticsearch import helpers
|
|
|
|
|
2017-08-01 18:02:08 +00:00
|
|
|
from nyaa import create_app, models
|
2017-07-30 17:35:16 +00:00
|
|
|
from nyaa.extensions import db
|
2017-05-14 06:48:17 +00:00
|
|
|
|
2017-08-01 18:02:08 +00:00
|
|
|
# Flask application built from the 'config' settings module.
app = create_app('config')

# Elasticsearch client (default host) with a 30 second request timeout,
# plus the index-administration client that wraps it.
es = Elasticsearch(timeout=30)
ic = IndicesClient(es)
|
2017-05-14 06:48:17 +00:00
|
|
|
|
2018-02-25 13:12:35 +00:00
|
|
|
def pad_bytes(in_bytes, size):
    """Right-pad ``in_bytes`` with NUL bytes until it is ``size`` bytes long.

    Input that is already ``size`` bytes or longer is returned as-is; it is
    never truncated.
    """
    shortfall = size - len(in_bytes)
    padding = b'\x00' * shortfall if shortfall > 0 else b''
    return in_bytes + padding
|
|
|
|
|
2017-05-14 06:48:17 +00:00
|
|
|
# turn into thing that elasticsearch indexes. We flatten in
|
|
|
|
# the stats (seeders/leechers) so we can order by them in es naturally.
|
|
|
|
# we _don't_ dereference uploader_id to the user's display name however,
|
|
|
|
# instead doing that at query time. I _think_ this is right because
|
|
|
|
# we don't want to reindex all the user's torrents just because they
|
|
|
|
# changed their name, and we don't really want to FTS search on the user anyway.
|
|
|
|
# Maybe it's more convenient to dereference though.
|
2017-05-27 23:12:48 +00:00
|
|
|
def mk_es(t, index_name):
    """Build the bulk-index action for torrent ``t`` targeting ``index_name``.

    Returns a dict with the ``_id``/``_type``/``_index`` action metadata and
    a ``_source`` document holding the torrent's fields, its flags as
    booleans, and the counters read from ``t.stats``.
    """
    doc = {
        # The id is also stored as a numeric field so results can be ordered
        # by it. Seems equivalent to ordering by created_time, but oh well.
        "id": t.id,
        "display_name": t.display_name,
        "created_time": t.created_time,
        # Not analyzed, but included so magnet links can be rendered
        # without querying sql again.
        "info_hash": pad_bytes(t.info_hash, 20).hex(),
        "filesize": t.filesize,
        "uploader_id": t.uploader_id,
        "main_category_id": t.main_category_id,
        "sub_category_id": t.sub_category_id,
        "comment_count": t.comment_count,
    }

    # XXX all the bitflags are numbers, so coerce each one to a real bool.
    # TODO: instead of indexing hidden/deleted and filtering later, such
    # torrents could be removed from es entirely. Probably won't matter
    # for at least a few months.
    for flag in ("anonymous", "trusted", "remake", "complete", "hidden", "deleted"):
        doc[flag] = bool(getattr(t, flag))
    doc["has_torrent"] = t.has_torrent

    # Stats are flattened into the document itself.
    stats = t.stats
    doc["download_count"] = stats.download_count
    doc["leech_count"] = stats.leech_count
    doc["seed_count"] = stats.seed_count

    return {
        "_id": t.id,
        "_type": "torrent",
        "_index": index_name,
        "_source": doc,
    }
|
|
|
|
|
|
|
|
# page through an sqlalchemy query, like the per_fetch but
|
|
|
|
# doesn't break the eager joins it's doing against the stats table.
|
|
|
|
# annoying that this isn't built in somehow.
|
2017-05-27 23:12:48 +00:00
|
|
|
def page_query(query, limit=sys.maxsize, batch_size=10000, progress_bar=None):
    """Yield the results of ``query`` by fetching them in consecutive slices.

    Parameters:
        query: object exposing ``slice(start, stop)`` that returns an
            iterable of results (an SQLAlchemy query in this script).
        limit: maximum number of results to yield in total.
        batch_size: number of results requested per ``slice`` call; a
            non-positive value yields nothing.
        progress_bar: optional object with an ``update(value)`` method. After
            each non-empty batch it is called with the number of results
            yielded so far.

    Iteration ends when a slice comes back empty or ``limit`` is reached.

    NOTE(review): slice-based paging only visits every row exactly once if
    the query's ordering is deterministic -- confirm the caller's query is
    ordered.
    """
    # If the bar exposes an integer ``maxval`` (as progressbar's ProgressBar
    # does), never report more than that: the table may have grown since the
    # caller counted it, and progressbar rejects out-of-range values.
    bar_max = getattr(progress_bar, 'maxval', None)
    if not isinstance(bar_max, int):
        bar_max = None

    start = 0
    processed = 0
    while True:
        stop = min(limit, start + batch_size)
        if stop <= start:
            # Limit already reached, or batch_size is not positive.
            break

        fetched = 0
        for thing in query.slice(start, stop):
            fetched += 1
            yield thing
        if not fetched:
            # An empty slice means the query is exhausted.
            break
        processed += fetched

        if progress_bar is not None:
            if bar_max is None:
                progress_bar.update(processed)
            else:
                progress_bar.update(min(processed, bar_max))

        if stop == limit:
            break
        start = stop
|
|
|
|
|
2017-05-27 23:12:48 +00:00
|
|
|
# (elasticsearch index name, torrent model class) pairs to import, in order.
FLAVORS = [
    ('nyaa', models.NyaaTorrent),
    ('sukebei', models.SukebeiTorrent),
]
|
|
|
|
|
|
|
|
# Get binlog status from mysql
|
2017-07-30 17:35:16 +00:00
|
|
|
with app.app_context():
    # Read the binlog position before importing anything and print it for
    # the operator to save (presumably so a binlog-based sync can pick up
    # from this point -- see the printed instruction below).
    master_status = db.engine.execute('SHOW MASTER STATUS;').fetchone()
    if master_status is None:
        # No row came back; fail with a clear message instead of a
        # TypeError from indexing None.
        sys.exit('SHOW MASTER STATUS returned no rows; '
                 'is binary logging enabled on this MySQL server?')

    position_json = {
        'log_file': master_status[0],
        'log_pos': master_status[1]
    }

    print('Save the following in the file configured in your ES sync config JSON:')
    print(json.dumps(position_json))

    for flavor, torrent_class in FLAVORS:
        print('Importing torrents for index', flavor, 'from', torrent_class)
        bar = progressbar.ProgressBar(
            maxval=torrent_class.query.count(),
            widgets=[progressbar.SimpleProgress(),
                     ' [', progressbar.Timer(), '] ',
                     progressbar.Bar(),
                     ' (', progressbar.ETA(), ') ',
                     ])

        # turn off refreshes while bulk loading
        ic.put_settings(body={'index': {'refresh_interval': '-1'}}, index=flavor)
        try:
            bar.start()
            actions = (mk_es(t, flavor)
                       for t in page_query(torrent_class.query, progress_bar=bar))
            helpers.bulk(es, actions, chunk_size=10000)
            bar.finish()

            # Refresh the index immediately
            ic.refresh(index=flavor)
            print('Index refresh done.')
        finally:
            # restore to near-enough real time, even if the import failed,
            # so the index is never left with refreshes disabled
            ic.put_settings(body={'index': {'refresh_interval': '30s'}}, index=flavor)
|