1
0
Fork 0
mirror of https://gitlab.com/SIGBUS/nyaa.git synced 2025-01-25 00:25:12 +00:00

Pad info_hash in ElasticSearch sync scripts

python-mysql-replication (or PyMySQL) would return fewer than 20 bytes
for info-hashes that had null bytes near the end, leaving incomplete
hashes in the ES index. Without delving too deep into the real issue
(be it a misunderstanding of how MySQL stores binary data or a bug in
the libraries), thankfully we can just pad the fixed-size info-hashes
to be 20 bytes.

Padding in import_to_es.py may be erring on the side of caution, but
safe is established to be better than sorry.

(SQLAlchemy is unaffected by this bug)

Fixes #456
This commit is contained in:
TheAMM 2018-02-25 15:12:35 +02:00
parent 0b98b2454a
commit 81806d7bc9
2 changed files with 8 additions and 2 deletions

View file

@ -21,6 +21,9 @@ app = create_app('config')
es = Elasticsearch(timeout=30)
ic = IndicesClient(es)
def pad_bytes(in_bytes, size):
    """Right-pad ``in_bytes`` with NUL bytes up to ``size`` bytes.

    Input that is already ``size`` bytes or longer gets nothing appended
    and is never truncated.
    """
    shortfall = size - len(in_bytes)
    # bytes(n) yields n zero bytes; guard against a non-positive count.
    zero_fill = bytes(shortfall) if shortfall > 0 else b''
    return in_bytes + zero_fill
# turn into thing that elasticsearch indexes. We flatten in
# the stats (seeders/leechers) so we can order by them in es naturally.
# we _don't_ dereference uploader_id to the user's display name however,
@ -42,7 +45,7 @@ def mk_es(t, index_name):
"created_time": t.created_time,
# not analyzed but included so we can render magnet links
# without querying sql again.
"info_hash": t.info_hash.hex(),
"info_hash": pad_bytes(t.info_hash, 20).hex(),
"filesize": t.filesize,
"uploader_id": t.uploader_id,
"main_category_id": t.main_category_id,

View file

@ -73,6 +73,9 @@ ES_CHUNK_SIZE = config.get('es_chunk_size', 10000)
# interacts with es' refresh_interval setting.
FLUSH_INTERVAL = config.get('flush_interval', 5)
def pad_bytes(in_bytes, size):
    """Return ``in_bytes`` extended on the right with NUL bytes to ``size``.

    Nothing is appended when the input already has ``size`` or more bytes;
    the input is never shortened.
    """
    missing = size - len(in_bytes)
    if missing > 0:
        # bytes(n) is a run of n zero bytes.
        tail = bytes(missing)
    else:
        tail = b''
    return in_bytes + tail
def reindex_torrent(t, index_name):
# XXX annoyingly different from import_to_es, and
# you need to keep them in sync manually.
@ -85,7 +88,7 @@ def reindex_torrent(t, index_name):
"description": t['description'],
# not analyzed but included so we can render magnet links
# without querying sql again.
"info_hash": t['info_hash'].hex(),
"info_hash": pad_bytes(t['info_hash'], 20).hex(),
"filesize": t['filesize'],
"uploader_id": t['uploader_id'],
"main_category_id": t['main_category_id'],