linkedin-posts-scraper/gen_content.py

113 lines
3.5 KiB
Python
Raw Normal View History

2023-02-19 11:10:12 +00:00
# Generate markdown that can be used with Hugo.
# A single JSON file that contains all the posts that we want to generate.
# This file contains a JSON array with LinkedIn user profile posts.
INP_FILE_PATH = "./out_posts_all_1.json"
# This file holds the current state of scraping: which articles have been scraped.
STORAGE_FILE_PATH = "./posts_scrapping_state.json"
# Output trees for the generated markdown: one organized by year/month, one flat.
MD_ORGANIZED_CONTENT_BASE = "./janco_website/true_content/posts"
MD_FLAT_CONTENT_BASE = "./janco_website/content/posts"
import json
from urllib.parse import urljoin, urlparse
import uuid
import os
from typing import Optional
def slurp_file_and_parse(inp_file_path):
    """Read the JSON file at *inp_file_path* and return the parsed value.

    The scraper dumps all posts as a single JSON array, so the result is
    normally a list of raw post dicts.
    """
    # json.load streams straight from the file object — no need to slurp
    # the whole file into a string first.
    with open(inp_file_path, encoding="utf-8") as inp_file:
        return json.load(inp_file)
posts = slurp_file_and_parse(INP_FILE_PATH)
from pprint import pprint
from slugify import slugify
from dataclasses import dataclass
from datetime import datetime
@dataclass
class Post:
    """A normalized LinkedIn post, ready to be rendered as a Hugo page."""

    id: int               # numeric LinkedIn activity id
    title: str            # human-readable title (placeholder until edited)
    created_at: datetime  # publication time derived from the activity id
    content: str          # plain-text body of the post
    original_url: str     # canonical LinkedIn URL of the post

    @property
    def slug(self) -> str:
        """URL-safe file-name stem derived from the title."""
        return slugify(self.title)
# Fully-qualified render-component key that marks a post with an attached article.
ARTICLE_COMPONENT = "com.linkedin.voyager.feed.render.ArticleComponent"
# Running counter used to generate unique placeholder titles in parse_post.
global_count = 0
def parse_post(raw_post):
    """Convert one raw LinkedIn feed entry (a dict) into a :class:`Post`.

    Side effect: increments the module-level ``global_count`` to produce a
    unique placeholder title for each post.

    Raises ``StopIteration`` if the entry has no SHARE_VIA action and
    ``KeyError`` if the expected nested keys are missing.
    """
    global global_count

    def datetime_from_id(activity_id: int) -> datetime:
        # The top 41 bits of a LinkedIn activity id encode the creation time
        # in milliseconds since the Unix epoch.
        # thanks to https://github.com/Ollie-Boyd/Linkedin-post-timestamp-extractor
        epoch_ms = (activity_id & 0xffffffffff800000) >> 22
        return datetime.utcfromtimestamp(epoch_ms / 1000)

    # "urn:li:activity:123..." -> 123...
    entity_id = int(raw_post['entityUrn'].split("activity:")[1].split(',')[0])

    # The SHARE_VIA action carries the canonical public URL of the post;
    # rebuilding from the path alone strips any query string / fragment.
    original_url = next(filter(
        lambda action: action['actionType'] == "SHARE_VIA",
        raw_post['updateMetadata']['updateActions']['actions'],
    ))['url']
    original_url = urljoin(original_url, urlparse(original_url).path)

    created_at = datetime_from_id(entity_id)
    global_count += 1

    # NOTE: a post may carry an attached article under
    # raw_post['content'][ARTICLE_COMPONENT]; extracting its image and
    # metadata is not implemented yet (TODO).
    return Post(
        id=entity_id,
        title=f"CHANGE_ME {global_count}",  # placeholder, edited by hand later
        content=raw_post['commentary']['text']['text'],
        created_at=created_at.replace(microsecond=0),
        original_url=original_url,
    )
import yaml

# Render every scraped post as a Hugo markdown page, organized by year/month.
for post in posts:
    pp = parse_post(post)

    # Hugo front matter for the generated markdown page.
    yaml_header = yaml.dump({
        'title': pp.title,
        'date': pp.created_at.isoformat(),
        'li-id': pp.id,
        'li-url': pp.original_url,
    })

    # <base>/<year>/<zero-padded month>/<slug>.md
    dir_container = f"{MD_ORGANIZED_CONTENT_BASE}/{pp.created_at.year}/{pp.created_at.month:02}"
    # exist_ok avoids the isdir/makedirs race and is a no-op when present.
    os.makedirs(dir_container, exist_ok=True)

    with open(f"{dir_container}/{pp.slug}.md", 'w', encoding="utf-8") as f:
        f.write("\n".join([
            "---",
            yaml_header.strip(),
            "---",
            "",
            pp.content,
        ]))