From 81806d7bc93fac87abeb27a3ee45eca4f3caa8f2 Mon Sep 17 00:00:00 2001 From: TheAMM Date: Sun, 25 Feb 2018 15:12:35 +0200 Subject: [PATCH] Pad info_hash in ElasticSearch sync scripts python-mysql-replication (or PyMySQL) would return less than 20 bytes for info-hashes that had null bytes near the end, leaving incomplete hashes in the ES index. Without delving too deep into the real issue (be it a lack of understanding of how MySQL stores binary data or a bug in the libraries), thankfully we can just pad the fixed-size info-hashes to be 20 bytes. Padding in import_to_es.py may be erring on the side of caution, but safe is established to be better than sorry. (SQLAlchemy is unaffected by this bug) Fixes #456 --- import_to_es.py | 5 ++++- sync_es.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/import_to_es.py b/import_to_es.py index ee1ec72..c2ed46f 100755 --- a/import_to_es.py +++ b/import_to_es.py @@ -21,6 +21,9 @@ app = create_app('config') es = Elasticsearch(timeout=30) ic = IndicesClient(es) +def pad_bytes(in_bytes, size): + return in_bytes + (b'\x00' * max(0, size - len(in_bytes))) + # turn into thing that elasticsearch indexes. We flatten in # the stats (seeders/leechers) so we can order by them in es naturally. # we _don't_ dereference uploader_id to the user's display name however, @@ -42,7 +45,7 @@ def mk_es(t, index_name): "created_time": t.created_time, # not analyzed but included so we can render magnet links # without querying sql again. - "info_hash": t.info_hash.hex(), + "info_hash": pad_bytes(t.info_hash, 20).hex(), "filesize": t.filesize, "uploader_id": t.uploader_id, "main_category_id": t.main_category_id, diff --git a/sync_es.py b/sync_es.py index 444c905..c4a9025 100755 --- a/sync_es.py +++ b/sync_es.py @@ -73,6 +73,9 @@ ES_CHUNK_SIZE = config.get('es_chunk_size', 10000) # interacts with es' refresh_interval setting. 
FLUSH_INTERVAL = config.get('flush_interval', 5) +def pad_bytes(in_bytes, size): + return in_bytes + (b'\x00' * max(0, size - len(in_bytes))) + def reindex_torrent(t, index_name): # XXX annoyingly different from import_to_es, and # you need to keep them in sync manually. @@ -85,7 +88,7 @@ def reindex_torrent(t, index_name): "description": t['description'], # not analyzed but included so we can render magnet links # without querying sql again. - "info_hash": t['info_hash'].hex(), + "info_hash": pad_bytes(t['info_hash'], 20).hex(), "filesize": t['filesize'], "uploader_id": t['uploader_id'], "main_category_id": t['main_category_id'],