Pad info_hash in ElasticSearch sync scripts

python-mysql-replication (or PyMySQL) would return fewer than 20 bytes
for info-hashes with null bytes near the end, leaving incomplete
hashes in the ES index. Without delving too deep into the real issue
(be it a misunderstanding of how MySQL stores binary data or a bug in
the libraries), thankfully we can just pad the fixed-size info-hashes
back to 20 bytes.
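
For illustration, a minimal sketch of the symptom and the fix (the
hash value here is made up; only the trailing-null truncation
matters):

    # a well-formed 20-byte info-hash whose last two bytes happen to be null
    full_hash = bytes.fromhex('aa' * 18 + '0000')
    # the replication stream was observed to hand back only the leading bytes
    truncated = full_hash.rstrip(b'\x00')  # simulates the truncation: 18 bytes
    # padding back to the fixed size recovers the original hash
    padded = truncated + b'\x00' * max(0, 20 - len(truncated))
    assert padded == full_hash and len(padded.hex()) == 40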

Padding in import_to_es.py may be erring on the side of caution, but
better safe than sorry.

(SQLAlchemy is unaffected by this bug)

Fixes #456
TheAMM 2018-02-25 15:12:35 +02:00
parent 0b98b2454a
commit 81806d7bc9
2 changed files with 8 additions and 2 deletions

import_to_es.py

@@ -21,6 +21,9 @@ app = create_app('config')
 es = Elasticsearch(timeout=30)
 ic = IndicesClient(es)
 
+def pad_bytes(in_bytes, size):
+    return in_bytes + (b'\x00' * max(0, size - len(in_bytes)))
+
 # turn into thing that elasticsearch indexes. We flatten in
 # the stats (seeders/leechers) so we can order by them in es naturally.
 # we _don't_ dereference uploader_id to the user's display name however,
@@ -42,7 +45,7 @@ def mk_es(t, index_name):
         "created_time": t.created_time,
         # not analyzed but included so we can render magnet links
         # without querying sql again.
-        "info_hash": t.info_hash.hex(),
+        "info_hash": pad_bytes(t.info_hash, 20).hex(),
         "filesize": t.filesize,
         "uploader_id": t.uploader_id,
         "main_category_id": t.main_category_id,

sync_es.py

@@ -73,6 +73,9 @@ ES_CHUNK_SIZE = config.get('es_chunk_size', 10000)
 # interacts with es' refresh_interval setting.
 FLUSH_INTERVAL = config.get('flush_interval', 5)
 
+def pad_bytes(in_bytes, size):
+    return in_bytes + (b'\x00' * max(0, size - len(in_bytes)))
+
 def reindex_torrent(t, index_name):
     # XXX annoyingly different from import_to_es, and
     # you need to keep them in sync manually.
@@ -85,7 +88,7 @@ def reindex_torrent(t, index_name):
         "description": t['description'],
         # not analyzed but included so we can render magnet links
         # without querying sql again.
-        "info_hash": t['info_hash'].hex(),
+        "info_hash": pad_bytes(t['info_hash'], 20).hex(),
         "filesize": t['filesize'],
         "uploader_id": t['uploader_id'],
         "main_category_id": t['main_category_id'],