"""Build the static site from the scraped post folders.

Every sub-directory of ``scraper_path`` is treated as one post.  Inside it:

* ``*.xz`` files are LZMA-compressed JSON metadata; the one carrying
  ``node.date`` supplies the post's timestamp,
* ``*.webp`` / ``*.jpg`` files are the post's pictures,
* ``*.txt`` files hold the caption.

For every post with a timestamp a page is rendered to
``<output_path>/posts/<timestamp>.html``.  The home page and the blog index
(all posts, newest first) are rendered to ``<output_path>/index.html`` and
``<output_path>/blog.html``.
"""
import bisect
import json
import lzma
import os
import re
from datetime import datetime

from jinja2 import Environment, PackageLoader, select_autoescape
from pytz import timezone

scraper_path = 'scraper/'
output_path = './static'
# Per-post pages live in their own sub-directory of the output directory.
posts_output_path = os.path.join(output_path, 'posts')

env = Environment(
    loader=PackageLoader("builder"),
    autoescape=select_autoescape()
)
blog_template = env.get_template("blog_template.html")
post_template = env.get_template("post.html")


def _ensure_dir(path: str) -> None:
    """Create directory *path* (and parents) unless it already exists.

    Raises:
        RuntimeError: if *path* exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except FileExistsError as err:
        if not os.path.isdir(path):
            raise RuntimeError(f'output path {path} exists and is not a directory') from err


def get_index(t: str) -> int:
    """Return the position of picture *t* within its post.

    The index is the run of decimal digits that follows the last ``_`` of
    the file name, extension excluded (e.g. ``..._3.jpg`` -> 3).  Names
    without such a numeric suffix -- presumably posts with a single
    picture -- yield 0.
    """
    stem = os.path.splitext(os.path.basename(t))[0]
    _, separator, suffix = stem.rpartition('_')
    # ``isdecimal`` (unlike ``isdigit``) only accepts characters int() can parse.
    if separator and suffix.isdecimal():
        return int(suffix)
    return 0


_ensure_dir(output_path)
_ensure_dir(posts_output_path)

posts = []
for folder in os.listdir(scraper_path):
    folder_path = os.path.join(scraper_path, folder)
    if not os.path.isdir(folder_path):
        continue

    post = {'pictures': []}
    for file in os.listdir(folder_path):
        filepath = os.path.join(folder_path, file)

        if file.endswith('.xz'):
            with lzma.open(filepath) as f:
                data = json.loads(f.read().decode('utf-8'))
            # checks that we're looking at a downloaded post, not the
            # profile picture or something
            if 'node' in data and 'date' in data['node']:
                timestamp = data['node']['date']
                post['timestamp'] = timestamp  # useful for sorting
                post['datetime'] = datetime.fromtimestamp(
                    timestamp, timezone("America/New_York")
                ).strftime("%b %e, '%y")
        elif file.endswith(('.webp', '.jpg')):
            # Keep the pictures ordered by their index within the post.
            bisect.insort_left(post['pictures'], filepath, key=get_index)
        elif file.endswith('.txt'):
            # NOTE(review): captions are assumed to be UTF-8 like the
            # metadata JSON -- confirm against the scraper's output.
            with open(filepath, 'r', encoding='utf-8') as caption_file:
                post['caption'] = caption_file.read()

    # Folders without post metadata (no timestamp) produce no page.
    if 'timestamp' not in post:
        continue

    # Insert once per folder, after the timestamp is final, so the list
    # stays sorted and a post can never be listed twice.
    # use negative timestamp because that's the easiest way to sort from
    # high to low
    bisect.insort_right(posts, post, key=lambda t: -t['timestamp'])

    post_page = os.path.join(posts_output_path, f"{post['timestamp']}.html")
    with open(post_page, 'w', encoding='utf-8') as output_file:
        output_file.write(post_template.render(post=post))

homepage_template = env.get_template("index.html")
with open(os.path.join(output_path, 'index.html'), 'w', encoding='utf-8') as output_file:
    output_file.write(homepage_template.render())

with open(os.path.join(output_path, 'blog.html'), 'w', encoding='utf-8') as output_file:
    output_file.write(blog_template.render(posts=posts))