# Generate markdown that can be used with Hugo.

# A single JSON file that contains all the posts that we want to generate.
# This file contains a JSON array with LinkedIn user profile posts.
INP_FILE_PATH = "./out_posts_all_1.json"
# This file holds the current state of scraping: which articles have been scraped.
STORAGE_FILE_PATH = "./posts_scrapping_state.json"
MD_ORGANIZED_CONTENT_BASE = "./janco_website/true_content/posts"
MD_FLAT_CONTENT_BASE = "./janco_website/content/posts"

import json
import os
import uuid
from dataclasses import dataclass
from datetime import datetime
from pprint import pprint
from typing import Optional
from urllib.parse import urljoin, urlparse

import yaml
from slugify import slugify


def slurp_file_and_parse(inp_file_path):
    """Read the whole input file and parse it as JSON."""
    with open(inp_file_path) as inp_file:
        return json.loads(inp_file.read())


posts = slurp_file_and_parse(INP_FILE_PATH)


@dataclass
class Post:
    id: int
    title: str
    created_at: datetime
    content: str
    original_url: str

    @property
    def slug(self) -> str:
        return slugify(self.title)


ARTICLE_COMPONENT = "com.linkedin.voyager.feed.render.ArticleComponent"

global_count = 0


def parse_post(raw_post):
    global global_count

    def datetime_from_id(post_id: int):
        # The top bits of a LinkedIn activity id encode the post's creation time
        # in milliseconds since the Unix epoch.
        # Thanks to https://github.com/Ollie-Boyd/Linkedin-post-timestamp-extractor
        epoch = (post_id & 0xffffffffff800000) >> 22
        return datetime.utcfromtimestamp(epoch / 1000)

    # "urn:li:activity:<id>,..." -> numeric activity id
    entity_id = int(raw_post['entityUrn'].split("activity:")[1].split(',')[0])

    # The SHARE_VIA action carries the public URL of the post; drop any query
    # string and fragment so only the canonical path remains.
    original_url = next(
        filter(lambda x: x['actionType'] == "SHARE_VIA",
               raw_post['updateMetadata']['updateActions']['actions'])
    )['url']
    original_url = urljoin(original_url, urlparse(original_url).path)

    created_at = datetime_from_id(entity_id)
    global_count += 1

    out_post = Post(
        id=entity_id,
        title=f"CHANGE_ME {global_count}",
        content=raw_post['commentary']['text']['text'],
        created_at=created_at.replace(microsecond=0),
        original_url=original_url,
    )

    # content_items = list(raw_post['content'].items())
    components = list(raw_post['content'].keys()) if 'content' in raw_post else []
    # The "content" key contains other components, such as
    # com.linkedin.voyager.feed.render.ArticleComponent, which indicates that a
    # (linked) article is attached to this post.
    if ARTICLE_COMPONENT in components:
        article = raw_post['content'][ARTICLE_COMPONENT]
        # pprint(article, depth=1)
        # image_path = article['largeImage']['attributes'][0]['vectorImage']['artifacts'][0]['fileIdentifyingUrlPathSegment']

    return out_post


for post in posts:
    pp = parse_post(post)

    # YAML front matter for Hugo.
    yaml_header = yaml.dump({
        'title': pp.title,
        'date': pp.created_at.isoformat(),
        'li-id': pp.id,
        'li-url': pp.original_url,
    })

    # Organize posts into <base>/<year>/<month>/ directories.
    dir_container = f"{MD_ORGANIZED_CONTENT_BASE}/{pp.created_at.year}/{pp.created_at.month:02}"
    os.makedirs(dir_container, exist_ok=True)

    with open(f"{dir_container}/{pp.slug}.md", 'w') as f:
        f.write("\n".join([
            "---",
            yaml_header.strip(),
            "---",
            "",
            pp.content,
        ]))
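
# Illustration only, with hypothetical values: for a post parsed above, the file
# written under MD_ORGANIZED_CONTENT_BASE would sit at a path like
# ./janco_website/true_content/posts/2021/03/change-me-1.md and look roughly like:
#
#   ---
#   date: '2021-03-15T09:30:00'
#   li-id: 6777123456789012345
#   li-url: https://www.linkedin.com/posts/some-activity
#   title: CHANGE_ME 1
#   ---
#
#   <post commentary text>
#
# (yaml.dump sorts keys alphabetically by default, hence the key order above.)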