mirror of
https://github.com/yk/gpt-4chan-public.git
synced 2025-03-26 14:21:50 +00:00
60 lines
1.6 KiB
Python
Executable file
60 lines
1.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
import json
|
|
import bs4
|
|
from loguru import logger
|
|
import multiprocessing as mp
|
|
import tqdm
|
|
from absl import app, flags
|
|
|
|
import warnings
|
|
# Ignore bs4's MarkupResemblesLocatorWarning when it is raised from the
# bs4 module.
warnings.filterwarnings(
    "ignore",
    category=bs4.MarkupResemblesLocatorWarning,
    module='bs4',
)

# Default input and output paths; used as the defaults of the --data_fn and
# --out_fn flags defined below.
DATA_FN = '../tmp/pol_062016-112019_labeled.ndjson'
OUT_FN = '../tmp/kek.txt'

flags.DEFINE_string(name='data_fn', default=DATA_FN, help='data file')
flags.DEFINE_string(name='out_fn', default=OUT_FN, help='output file')

# Global flag registry; main reads FLAGS.data_fn and FLAGS.out_fn from it.
FLAGS = flags.FLAGS
|
|
|
|
# Adapted from: https://gist.github.com/zmwangx/ad0830ba94b1fd98f428
|
|
def text_with_newlines(elem):
    """Flatten an element's descendants to text, keeping line breaks.

    Walks ``elem.descendants`` in order. Each descendant that is a ``str``
    instance is copied through unchanged (it is not stripped). Each
    descendant whose ``name`` is ``'br'`` or ``'p'`` contributes one newline.
    Every other descendant is skipped.

    Args:
        elem: Object exposing a ``descendants`` iterable; ``parse_line``
            passes a ``bs4.BeautifulSoup`` document.

    Returns:
        The collected text as a single ``str``.
    """
    pieces = []
    for node in elem.descendants:
        if isinstance(node, str):
            pieces.append(node)
        elif node.name in ('br', 'p'):
            pieces.append('\n')
    return ''.join(pieces)
|
|
|
|
|
|
def parse_line(line):
    """Render one JSON-encoded thread as plain text, one section per post.

    ``line`` is decoded with ``json.loads`` and its ``'posts'`` list is walked
    (a missing key is treated as no posts). For each post, the ``'com'`` field,
    when present, is parsed as markup with BeautifulSoup/lxml, flattened with
    ``text_with_newlines`` and stripped; a post without ``'com'`` gets an
    empty body. Each section is ``'--- <no>'`` on one line, followed by the
    body.

    A post that raises while being rendered is logged and left out of the
    result; the remaining posts are still processed.

    Args:
        line: One line of the NDJSON input, holding a single JSON object.

    Returns:
        The rendered sections joined by newlines; ``''`` when there are no
        posts.
    """
    thread = json.loads(line)
    sections = []
    for post in thread.get('posts', []):
        try:
            body = ''
            if 'com' in post:
                markup = bs4.BeautifulSoup(post['com'], 'lxml')
                body = text_with_newlines(markup).strip()
            sections.append(f'--- {post["no"]}\n{body}')
        except Exception:
            # Best effort: one malformed post must not lose the whole thread.
            logger.exception(f'failed to parse post {post}')
    return '\n'.join(sections)
|
|
|
|
|
|
def main(_):
    """Convert the thread dump at FLAGS.data_fn to plain text at FLAGS.out_fn.

    Every line of the input file is handed to ``parse_line`` in a
    ``multiprocessing`` pool. ``Pool.imap`` yields results in input order, so
    threads are written in the order they appear in the input. Each rendered
    thread is followed by a ``-----`` separator line. Progress over input
    lines is shown with tqdm.

    Args:
        _: Unused; positional argument supplied by ``app.run``.
    """
    # The input is JSON text and the posts can hold arbitrary Unicode, so pin
    # UTF-8 on both files instead of relying on the locale's default
    # encoding, which is not UTF-8 on every platform.
    with open(FLAGS.out_fn, 'w', encoding='utf-8') as out_f:
        with open(FLAGS.data_fn, encoding='utf-8') as in_f:
            with mp.Pool() as pool:
                for parsed_line in pool.imap(parse_line, tqdm.tqdm(in_f)):
                    out_f.write(parsed_line + '\n-----\n')
|
|
|
|
|
|
# Script entry point: run main through absl's app runner.
if __name__ == '__main__':
    app.run(main)
|