Mirror of https://gitlab.com/SIGBUS/nyaa.git, synced 2024-12-22 14:49:59 +00:00
sync_es: move io to separate threads, config json
throughput is definitely massively improved, testing locally. hopefully it'll be enough. config moved to a separate file by ops request. lazy lazy
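For a sense of the new shape: a minimal standalone sketch of the hand-off this commit sets up (the names and numbers here are illustrative stand-ins, not the committed code; the actual classes in the diff below are BinlogReader and EsPoster, joined by a bounded Queue):

from queue import Queue, Empty
from threading import Thread

buf = Queue(maxsize=10000)   # bounded, so the reader blocks instead of growing the heap

def read_binlog():
    # producer stand-in: the real thread tails the MySQL binlog and emits one action per changed row
    for n in range(2500):
        buf.put(("binlog-pos", n), block=True)

def post_to_es():
    # consumer stand-in: batch up to a chunk size, or flush whatever arrived within the interval
    while True:
        actions = []
        while len(actions) < 1000:
            try:
                actions.append(buf.get(block=True, timeout=1))
            except Empty:
                break            # idle for a whole interval, flush what we have
        if not actions:
            break                # demo only; the real consumer loops forever
        print(f"would bulk-post {len(actions)} actions")   # stand-in for elasticsearch.helpers.bulk

reader = Thread(target=read_binlog)
poster = Thread(target=post_to_es)
reader.start()
poster.start()
reader.join()
poster.join()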
parent 6a4ad827c1
commit ea2160a49d

config_es_sync.json (new file, 11 lines)
@@ -0,0 +1,11 @@
+{
+    "save_loc": "/tmp/pos.json",
+    "mysql_host": "127.0.0.1",
+    "mysql_port": 13306,
+    "mysql_user": "root",
+    "mysql_password": "dunnolol",
+    "database": "nyaav2",
+    "internal_queue_depth": 10000,
+    "es_chunk_size": 10000,
+    "flush_interval": 5
+}
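For reference, a minimal sketch of how the script consumes this file (it mirrors the config.get() fallbacks added in the sync_es.py diff below; the JSON path is passed as the script's only argument):

import json
import sys

# usage: python sync_es.py config_es_sync.json
if len(sys.argv) != 2:
    print("need config.json location", file=sys.stderr)
    sys.exit(-1)

with open(sys.argv[1]) as f:
    config = json.load(f)

# every key has a fallback, so a partial config file still works
SAVE_LOC = config.get('save_loc', "/tmp/pos.json")
MYSQL_HOST = config.get('mysql_host', '127.0.0.1')
ES_CHUNK_SIZE = config.get('es_chunk_size', 10000)
FLUSH_INTERVAL = config.get('flush_interval', 5)   # seconds of idleness before flushing to es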
sync_es.py (290 changed lines)
@@ -21,9 +21,12 @@ when import_to_es and sync_es will be lost. Instead, you can run SHOW MASTER
 STATUS _before_ you run import_to_es. That way you'll definitely pick up any
 changes that happen while the import_to_es script is dumping stuff from the
 database into es, at the expense of redoing a (small) amount of indexing.
+
+This uses multithreading so we don't have to block on socket io (both binlog
+reading and es POSTing). asyncio soon™
 """
 from elasticsearch import Elasticsearch
-from elasticsearch.helpers import bulk
+from elasticsearch.helpers import bulk, BulkIndexError
 from pymysqlreplication import BinLogStreamReader
 from pymysqlreplication.row_event import UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent
 from datetime import datetime
@@ -33,54 +36,38 @@ import json
 import time
 import logging
 from statsd import StatsClient
+from threading import Thread
+from queue import Queue, Empty
 
 logging.basicConfig(format='%(asctime)s %(levelname)s %(name)s - %(message)s')
 
 log = logging.getLogger('sync_es')
 log.setLevel(logging.INFO)
 
+# config in json, 2lazy to argparse
+if len(sys.argv) != 2:
+    print("need config.json location", file=sys.stderr)
+    sys.exit(-1)
+with open(sys.argv[1]) as f:
+    config = json.load(f)
+
 # goes to netdata or other statsd listener
 stats = StatsClient('localhost', 8125, prefix="sync_es")
 
 #logging.getLogger('elasticsearch').setLevel(logging.DEBUG)
 
 # in prod want in /var/lib somewhere probably
-SAVE_LOC = "/var/lib/sync_es_position.json"
-MYSQL_HOST = '127.0.0.1'
-MYSQL_PORT = 3306
-MYSQL_USER = 'test'
-MYSQL_PW = 'test123'
-NT_DB = 'nyaav2'
-
-with open(SAVE_LOC) as f:
-    pos = json.load(f)
-
-es = Elasticsearch(timeout=30)
-
-stream = BinLogStreamReader(
-        # TODO parse out from config.py or something
-        connection_settings = {
-            'host': MYSQL_HOST,
-            'port': MYSQL_PORT,
-            'user': MYSQL_USER,
-            'passwd': MYSQL_PW
-        },
-        server_id=10, # arbitrary
-        # only care about this database currently
-        only_schemas=[NT_DB],
-        # these tables in the database
-        only_tables=["nyaa_torrents", "nyaa_statistics", "sukebei_torrents", "sukebei_statistics"],
-        # from our save file
-        resume_stream=True,
-        log_file=pos['log_file'],
-        log_pos=pos['log_pos'],
-        # skip the other stuff like table mapping
-        only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
-        # if we're at the head of the log, block until something happens
-        # note it'd be nice to block async-style instead, but the mainline
-        # binlogreader is synchronous. there is an (unmaintained?) fork
-        # using aiomysql if anybody wants to revive that.
-        blocking=True)
+SAVE_LOC = config.get('save_loc', "/tmp/pos.json")
+MYSQL_HOST = config.get('mysql_host', '127.0.0.1')
+MYSQL_PORT = config.get('mysql_port', 3306)
+MYSQL_USER = config.get('mysql_user', 'root')
+MYSQL_PW = config.get('mysql_password', 'dunnolol')
+NT_DB = config.get('database', 'nyaav2')
+INTERNAL_QUEUE_DEPTH = config.get('internal_queue_depth', 10000)
+ES_CHUNK_SIZE = config.get('es_chunk_size', 10000)
+# seconds since no events happening to flush to es. remember this also
+# interacts with es' refresh_interval setting.
+FLUSH_INTERVAL = config.get('flush_interval', 5)
 
 def reindex_torrent(t, index_name):
     # XXX annoyingly different from import_to_es, and
@@ -145,73 +132,176 @@ def delet_this(row, index_name):
             '_type': 'torrent',
             '_id': str(row['values']['id'])}
 
-last_save = time.time()
-since_last = 0
-
-# get POST latency
-@stats.timer('post_bulk')
-def timed_bulk(events, **kwargs):
-    bulk(es, events, **kwargs)
-
-log.info(f"reading binlog from {stream.log_file}/{stream.log_pos}")
-
-for event in stream:
-    with stats.pipeline() as s:
-        s.incr('total_events')
-        s.incr(f"event.{event.table}.{type(event).__name__}")
-        s.incr('total_rows', len(event.rows))
-        s.incr(f"rows.{event.table}.{type(event).__name__}", len(event.rows))
-        # XXX not a "timer", but we get a histogram out of it
-        s.timing(f"rows_per_event.{event.table}.{type(event).__name__}", len(event.rows))
-    if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
-        if event.table == "nyaa_torrents":
-            index_name = "nyaa"
-        else:
-            index_name = "sukebei"
-        if type(event) is WriteRowsEvent:
-            timed_bulk(reindex_torrent(row['values'], index_name) for row in event.rows)
-        elif type(event) is UpdateRowsEvent:
-            # UpdateRowsEvent includes the old values too, but we don't care
-            timed_bulk(reindex_torrent(row['after_values'], index_name) for row in event.rows)
-        elif type(event) is DeleteRowsEvent:
-            # ok, bye
-            timed_bulk(delet_this(row, index_name) for row in event.rows)
-        else:
-            raise Exception(f"unknown event {type(event)}")
-    elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
-        if event.table == "nyaa_statistics":
-            index_name = "nyaa"
-        else:
-            index_name = "sukebei"
-        if type(event) is WriteRowsEvent:
-            timed_bulk(
-                    (reindex_stats(row['values'], index_name) for row in event.rows),
-                    # statistics updates are pretty low priority, plus in certain
-                    # cases where we're really out of sync, the torrent could be missing,
-                    # causing a "document missing" error from es, with no way to suppress
-                    # that server-side. Thus ignore all errors, oh well.
-                    raise_on_error=False, stats_only=True)
-        elif type(event) is UpdateRowsEvent:
-            timed_bulk(
-                    (reindex_stats(row['after_values'], index_name) for row in event.rows),
-                    raise_on_error=False, stats_only=True)
-        elif type(event) is DeleteRowsEvent:
-            # uh ok. Assume that the torrent row will get deleted later,
-            # which will clean up the entire es "torrent" document
-            pass
-        else:
-            raise Exception(f"unknown event {type(event)}")
-    else:
-        raise Exception(f"unknown table {s.table}")
-
-    # how far we're behind, wall clock
-    stats.gauge('process_latency', int((time.time() - event.timestamp) * 1000))
-
-    since_last += 1
-    if since_last >= 10000 or (time.time() - last_save) > 10:
-        log.info(f"saving position {stream.log_file}/{stream.log_pos}, {time.time() - event.timestamp:,.3} seconds behind")
-        with stats.timer('save_pos'):
-            with open(SAVE_LOC, 'w') as f:
-                json.dump({"log_file": stream.log_file, "log_pos": stream.log_pos}, f)
-        last_save = time.time()
-        since_last = 0
+class BinlogReader(Thread):
+    # write_buf is the Queue we communicate with
+    def __init__(self, write_buf):
+        Thread.__init__(self)
+        self.write_buf = write_buf
+
+    def run(self):
+        with open(SAVE_LOC) as f:
+            pos = json.load(f)
+
+        stream = BinLogStreamReader(
+                # TODO parse out from config.py or something
+                connection_settings = {
+                    'host': MYSQL_HOST,
+                    'port': MYSQL_PORT,
+                    'user': MYSQL_USER,
+                    'passwd': MYSQL_PW
+                },
+                server_id=10, # arbitrary
+                # only care about this database currently
+                only_schemas=[NT_DB],
+                # these tables in the database
+                only_tables=["nyaa_torrents", "nyaa_statistics", "sukebei_torrents", "sukebei_statistics"],
+                # from our save file
+                resume_stream=True,
+                log_file=pos['log_file'],
+                log_pos=pos['log_pos'],
+                # skip the other stuff like table mapping
+                only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
+                # if we're at the head of the log, block until something happens
+                # note it'd be nice to block async-style instead, but the mainline
+                # binlogreader is synchronous. there is an (unmaintained?) fork
+                # using aiomysql if anybody wants to revive that.
+                blocking=True)
+
+        log.info(f"reading binlog from {stream.log_file}/{stream.log_pos}")
+
+        for event in stream:
+            # save the pos of the stream and timestamp with each message, so we
+            # can commit in the other thread. and keep track of process latency
+            pos = (stream.log_file, stream.log_pos, event.timestamp)
+            with stats.pipeline() as s:
+                s.incr('total_events')
+                s.incr(f"event.{event.table}.{type(event).__name__}")
+                s.incr('total_rows', len(event.rows))
+                s.incr(f"rows.{event.table}.{type(event).__name__}", len(event.rows))
+                # XXX not a "timer", but we get a histogram out of it
+                s.timing(f"rows_per_event.{event.table}.{type(event).__name__}", len(event.rows))
+
+            if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
+                if event.table == "nyaa_torrents":
+                    index_name = "nyaa"
+                else:
+                    index_name = "sukebei"
+                if type(event) is WriteRowsEvent:
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_torrent(row['values'], index_name)),
+                                block=True)
+                elif type(event) is UpdateRowsEvent:
+                    # UpdateRowsEvent includes the old values too, but we don't care
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_torrent(row['after_values'], index_name)),
+                                block=True)
+                elif type(event) is DeleteRowsEvent:
+                    # ok, bye
+                    for row in event.rows:
+                        self.write_buf.put((pos, delet_this(row, index_name)), block=True)
+                else:
+                    raise Exception(f"unknown event {type(event)}")
+            elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
+                if event.table == "nyaa_statistics":
+                    index_name = "nyaa"
+                else:
+                    index_name = "sukebei"
+                if type(event) is WriteRowsEvent:
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_stats(row['values'], index_name)),
+                                block=True)
+                elif type(event) is UpdateRowsEvent:
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_stats(row['after_values'], index_name)),
+                                block=True)
+                elif type(event) is DeleteRowsEvent:
+                    # uh ok. Assume that the torrent row will get deleted later,
+                    # which will clean up the entire es "torrent" document
+                    pass
+                else:
+                    raise Exception(f"unknown event {type(event)}")
+            else:
+                raise Exception(f"unknown table {s.table}")
+
+class EsPoster(Thread):
+    # read_buf is the queue of stuff to bulk post
+    def __init__(self, read_buf, chunk_size=1000, flush_interval=5):
+        Thread.__init__(self)
+        self.read_buf = read_buf
+        self.chunk_size = chunk_size
+        self.flush_interval = flush_interval
+
+    def run(self):
+        es = Elasticsearch(timeout=30)
+
+        last_save = time.time()
+        since_last = 0
+
+        while True:
+            actions = []
+            while len(actions) < self.chunk_size:
+                try:
+                    # grab next event from queue with metadata that creepily
+                    # updates, surviving outside the scope of the loop
+                    ((log_file, log_pos, timestamp), action) = \
+                        self.read_buf.get(block=True, timeout=self.flush_interval)
+                    actions.append(action)
+                except Empty:
+                    # nothing new for the whole interval
+                    break
+
+            if not actions:
+                # nothing to post
+                log.debug("no changes...")
+                continue
+
+            # XXX "time" to get histogram of no events per bulk
+            stats.timing('actions_per_bulk', len(actions))
+
+            try:
+                with stats.timer('post_bulk'):
+                    bulk(es, actions, chunk_size=self.chunk_size)
+            except BulkIndexError as bie:
+                # in certain cases where we're really out of sync, we update a
+                # stat when the torrent doc is, causing a "document missing"
+                # error from es, with no way to suppress that server-side.
+                # Thus ignore that type of error if it's the only problem
+                for e in bie.errors:
+                    try:
+                        if e['update']['error']['type'] != 'document_missing_exception':
+                            raise bie
+                    except KeyError:
+                        raise bie
+
+            # how far we're behind, wall clock
+            stats.gauge('process_latency', int((time.time() - timestamp) * 1000))
+
+            since_last += len(actions)
+            if since_last >= 10000 or (time.time() - last_save) > 10:
+                log.info(f"saving position {log_file}/{log_pos}, {time.time() - timestamp:,.3f} seconds behind")
+                with stats.timer('save_pos'):
+                    with open(SAVE_LOC, 'w') as f:
+                        json.dump({"log_file": log_file, "log_pos": log_pos}, f)
+                last_save = time.time()
+                since_last = 0
+
+# in-memory queue between binlog and es. The bigger it is, the more events we
+# can parse in memory while waiting for es to catch up, at the expense of heap.
+buf = Queue(maxsize=INTERNAL_QUEUE_DEPTH)
+
+reader = BinlogReader(buf)
+reader.daemon = True
+writer = EsPoster(buf, chunk_size=ES_CHUNK_SIZE, flush_interval=FLUSH_INTERVAL)
+writer.daemon = True
+reader.start()
+writer.start()
+
+# on the main thread, poll the queue size for monitoring
+while True:
+    stats.gauge('queue_depth', buf.qsize())
+    time.sleep(1)
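The BulkIndexError handling in EsPoster.run above boils down to one idea: stat updates can target a torrent document that doesn't exist yet when the sync is far behind, es reports that as document_missing_exception with no server-side way to suppress it, so only that error is swallowed and everything else is re-raised. A standalone sketch of the same filtering (hypothetical helper name, not the committed code), using the same elasticsearch-py bulk helper:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, BulkIndexError

def post_batch(es, actions, chunk_size=10000):
    # hypothetical helper; mirrors the error filtering done in EsPoster.run
    try:
        bulk(es, actions, chunk_size=chunk_size)
    except BulkIndexError as bie:
        for err in bie.errors:
            # non-update errors have no 'update' key and fall through to the raise
            error_type = err.get('update', {}).get('error', {}).get('type')
            if error_type != 'document_missing_exception':
                raise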