# Generate markdown that can be used with Hugo.

# A single JSON file that contains all the posts that we want to generate.
# This file contains a JSON array of LinkedIn user profile posts.
INP_FILE_PATH = "./out_posts_all_1.json"
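# A rough sketch of the fields that parse_post() below reads from each element
# of that array (field names are taken from the accesses in the code; the exact
# URN/URL values shown here are illustrative assumptions, not real data):
#
#   {
#     "entityUrn": "urn:li:fs_updateV2:(urn:li:activity:7050000000000000000,...)",
#     "commentary": {"text": {"text": "the post body ..."}},
#     "updateMetadata": {"updateActions": {"actions": [
#         {"actionType": "SHARE_VIA", "url": "https://www.linkedin.com/feed/update/..."}]}},
#     "content": {"com.linkedin.voyager.feed.render.ArticleComponent": {...}}   # optional
#   }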
# This file holds the current state of scraping, i.e. which articles have already been scraped.
STORAGE_FILE_PATH = "./posts_scrapping_state.json"

MD_ORGANIZED_CONTENT_BASE = "./janco_website/true_content/posts"
MD_FLAT_CONTENT_BASE = "./janco_website/content/posts"
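# Note: only MD_ORGANIZED_CONTENT_BASE is used in this script; posts are written
# under <base>/<year>/<month>/<slug>.md (see the loop at the bottom).
# MD_FLAT_CONTENT_BASE is presumably for a flat Hugo content layout and is unused here.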
import json
from urllib.parse import urljoin, urlparse
import uuid
import os
from typing import Optional

def slurp_file_and_parse(inp_file_path):
    posts = []
    with open(inp_file_path) as inp_file:
        posts = json.loads(inp_file.read())
    return posts

posts = slurp_file_and_parse(INP_FILE_PATH)

from pprint import pprint
from slugify import slugify
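# Note: `slugify` is assumed to come from the python-slugify package
# (pip install python-slugify); e.g. slugify("Hello, World!") == "hello-world",
# which is what Post.slug below relies on to build the markdown file names.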

from dataclasses import dataclass
from datetime import datetime

@dataclass
class Post:
    id: int
    title: str
    created_at: datetime
    content: str
    original_url: str

    @property
    def slug(self) -> str:
        return slugify(self.title)


ARTICLE_COMPONENT = "com.linkedin.voyager.feed.render.ArticleComponent"

global_count = 0
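# global_count is only used to number the placeholder "CHANGE_ME n" titles below,
# so each generated post gets a distinct slug until real titles are (presumably)
# filled in by hand afterwards.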

def parse_post(raw_post):
    global global_count

    def datetime_from_id(id: int):
        # thanks to https://github.com/Ollie-Boyd/Linkedin-post-timestamp-extractor
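        # The top ~41 bits of the numeric activity id encode the post creation
        # time as a Unix timestamp in milliseconds; the mask keeps those high
        # bits, the shift moves them down into place, and the division by 1000
        # converts milliseconds into the seconds that utcfromtimestamp expects.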
        epoch = (id & 0xffffffffff800000) >> 22
        return datetime.utcfromtimestamp(epoch/1000)

    entity_id = int(raw_post['entityUrn'].split("activity:")[1].split(',')[0])

    original_url = next(filter(lambda x: x['actionType'] == "SHARE_VIA", raw_post['updateMetadata']['updateActions']['actions']))['url']
    original_url = urljoin(original_url, urlparse(original_url).path)

    created_at = datetime_from_id(entity_id)

    global_count += 1
    out_post = Post(
        id=entity_id,
        title=f"CHANGE_ME {global_count}",
        content=raw_post['commentary']['text']['text'],
        created_at=created_at.replace(microsecond=0),
        original_url=original_url
        # title=raw_post
    )

    # content_items = list(raw_post['content'].items())
    components = list(raw_post['content'].keys()) if 'content' in raw_post else []
    # the "content" key holds other components, like com.linkedin.voyager.feed.render.ArticleComponent,
    # which indicate that an article (link) is attached to this post

    if ARTICLE_COMPONENT in components:
        article = raw_post['content'][ARTICLE_COMPONENT]
        # pprint(article, depth=1)
        # image_path = article['largeImage']['attributes'][0]['vectorImage']['artifacts'][0]['fileIdentifyingUrlPathSegment']

    return out_post

import yaml

for post in posts:
    pp = parse_post(post)

    yaml_header = yaml.dump({
        'title': pp.title,
        'date': pp.created_at.isoformat(),
        'li-id': pp.id,
        'li-url': pp.original_url,
    })
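    # Illustrative example of the resulting front matter (values are made up;
    # note that yaml.dump sorts keys alphabetically by default):
    #
    #   date: '2023-05-04T10:11:12'
    #   li-id: 7050000000000000000
    #   li-url: https://www.linkedin.com/feed/update/urn:li:activity:7050000000000000000/
    #   title: CHANGE_ME 1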
    dir_container = f"{MD_ORGANIZED_CONTENT_BASE}/{pp.created_at.year}/{pp.created_at.month:02}"
    if not os.path.isdir(dir_container):
        os.makedirs(dir_container)

    with open(f"{dir_container}/{pp.slug}.md", 'w') as f:
        f.write("\n".join([
            "---",
            yaml_header.strip(),
            "---",
            "",
            pp.content
        ]))
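# Illustrative output layout (paths use the defaults above; slugs come from the
# placeholder titles until they are edited by hand), e.g.:
#
#   ./janco_website/true_content/posts/2023/05/change-me-1.md
#   ./janco_website/true_content/posts/2023/06/change-me-2.md
#
# Each file starts with the YAML front matter between "---" markers, followed by
# a blank line and the raw post text, which is the layout Hugo expects.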