mirror of
https://gitlab.com/SIGBUS/nyaa.git
synced 2025-01-25 00:25:12 +00:00
Pad info_hash in ElasticSearch sync scripts
python-mysql-replication (or PyMySQL) would return less than 20 bytes for info-hashes that had null bytes near the end, leaving incomplete hashes in the ES index. Without delving too deep into the real issue (be it lack of understanding MySQL storing binary data or a bug in the libraries), thankfully we can just pad the fixed-size info-hashes to be 20 bytes. Padding in import_to_es.py may be erring on the side of caution, but safe is established to be better than sorry. (SQLAlchemy is unaffected by this bug) Fixes #456
This commit is contained in:
parent
0b98b2454a
commit
81806d7bc9
|
@@ -21,6 +21,9 @@ app = create_app('config')
|
|||
es = Elasticsearch(timeout=30)
|
||||
ic = IndicesClient(es)
|
||||
|
||||
def pad_bytes(in_bytes, size):
    """Right-pad *in_bytes* with NUL bytes so the result is at least *size* bytes.

    Inputs that are already *size* bytes or longer are returned unchanged.
    Used to restore fixed-width info-hashes whose trailing null bytes were
    dropped by the replication library.
    """
    return in_bytes.ljust(size, b'\x00')
|
||||
|
||||
# turn into thing that elasticsearch indexes. We flatten in
|
||||
# the stats (seeders/leechers) so we can order by them in es naturally.
|
||||
# we _don't_ dereference uploader_id to the user's display name however,
|
||||
|
@@ -42,7 +45,7 @@ def mk_es(t, index_name):
|
|||
"created_time": t.created_time,
|
||||
# not analyzed but included so we can render magnet links
|
||||
# without querying sql again.
|
||||
"info_hash": t.info_hash.hex(),
|
||||
"info_hash": pad_bytes(t.info_hash, 20).hex(),
|
||||
"filesize": t.filesize,
|
||||
"uploader_id": t.uploader_id,
|
||||
"main_category_id": t.main_category_id,
|
||||
|
|
|
@@ -73,6 +73,9 @@ ES_CHUNK_SIZE = config.get('es_chunk_size', 10000)
|
|||
# interacts with es' refresh_interval setting.
|
||||
FLUSH_INTERVAL = config.get('flush_interval', 5)
|
||||
|
||||
def pad_bytes(in_bytes, size):
    """Return *in_bytes* extended on the right with NUL bytes to *size* bytes.

    If the input already meets or exceeds *size*, it is returned as-is.
    Compensates for truncated fixed-width info-hash values (see import_to_es).
    """
    shortfall = size - len(in_bytes)
    if shortfall <= 0:
        return in_bytes
    return in_bytes + b'\x00' * shortfall
|
||||
|
||||
def reindex_torrent(t, index_name):
|
||||
# XXX annoyingly different from import_to_es, and
|
||||
# you need to keep them in sync manually.
|
||||
|
@@ -85,7 +88,7 @@ def reindex_torrent(t, index_name):
|
|||
"description": t['description'],
|
||||
# not analyzed but included so we can render magnet links
|
||||
# without querying sql again.
|
||||
"info_hash": t['info_hash'].hex(),
|
||||
"info_hash": pad_bytes(t['info_hash'], 20).hex(),
|
||||
"filesize": t['filesize'],
|
||||
"uploader_id": t['uploader_id'],
|
||||
"main_category_id": t['main_category_id'],
|
||||
|
|
Loading…
Reference in a new issue