# linkedin-posts-scraper/gen_content.py


# Generate markdown that can be used with Hugo

# A single JSON file that contains all the posts that we want to generate.
# This file contains a JSON array of LinkedIn user-profile posts.
INP_FILE_PATH = "./out_posts_all_1.json"
# This file holds the current state of scraping, i.e. which articles have already been scraped.
# NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
STORAGE_FILE_PATH = "./posts_scrapping_state.json"

# Root directory for the generated markdown, laid out as <year>/<month>/<slug>.md.
MD_ORGANIZED_CONTENT_BASE = "./janco_website/true_content/posts"
# NOTE(review): not referenced in the visible part of this file — presumably a flat
# (non-dated) output root for the same posts; confirm before relying on it.
MD_FLAT_CONTENT_BASE = "./janco_website/content/posts"

import json
from urllib.parse import urljoin, urlparse
import uuid
import os
from typing import Optional

def slurp_file_and_parse(inp_file_path):
    """Read the JSON document stored at ``inp_file_path`` and return it decoded.

    Args:
        inp_file_path: Path of the JSON file to read.

    Returns:
        Whatever the document decodes to (for the posts dump this is a list
        of post dicts).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
    # non-ASCII post text is read identically on every machine.
    with open(inp_file_path, encoding="utf-8") as inp_file:
        return json.load(inp_file)

# Load all raw posts eagerly at module level: this runs on import/execution,
# so INP_FILE_PATH must exist before the script starts.
posts = slurp_file_and_parse(INP_FILE_PATH)

from pprint import pprint
from slugify import slugify

from dataclasses import dataclass
from datetime import datetime

@dataclass
class Post:
    """A LinkedIn post reduced to the fields needed to render one markdown page.

    Field order is part of the generated ``__init__`` signature; do not reorder.
    """

    # Numeric LinkedIn activity id of the post.
    id: int
    # Human-readable title; also the source of the file name via ``slug``.
    title: str
    # Creation time of the post.
    created_at: datetime
    # Text body of the post, written verbatim below the front matter.
    content: str
    # Link back to the post on LinkedIn.
    original_url: str

    @property
    def slug(self) -> str:
        """Return the URL/file-name friendly form of ``title`` (via ``slugify``)."""
        return slugify(self.title)


# Key inside a raw post's "content" mapping that indicates an article (link)
# is attached to the post.
ARTICLE_COMPONENT = "com.linkedin.voyager.feed.render.ArticleComponent"

# Running count of parsed posts; parse_post() increments it and uses the value
# to number the placeholder titles.
global_count = 0

def parse_post(raw_post):
    """Convert one raw LinkedIn post dict into a ``Post``.

    The numeric id is parsed from ``raw_post['entityUrn']`` and the creation
    time is decoded from that same id. The URL comes from the ``SHARE_VIA``
    update action, reduced to scheme, host and path (query string and fragment
    are dropped).

    Side effect: increments the module-level ``global_count``; the new value
    is embedded in the placeholder title (``CHANGE_ME <n>``).

    Args:
        raw_post: Decoded JSON object of a single post. It must provide
            ``entityUrn``, ``commentary.text.text`` and
            ``updateMetadata.updateActions.actions``.

    Returns:
        The populated ``Post``. ``created_at`` is a naive datetime expressed
        in UTC and truncated to whole seconds.

    Raises:
        ValueError: if the post carries no ``SHARE_VIA`` action (previously a
            bare ``StopIteration`` escaped), or if the id is not numeric.
        KeyError, IndexError: if another expected field is missing.
    """
    global global_count
    # Imported locally because the module only imports ``datetime`` itself.
    from datetime import timezone

    def datetime_from_id(activity_id: int):
        # thanks to https://github.com/Ollie-Boyd/Linkedin-post-timestamp-extractor
        # The high bits of the id hold a Unix timestamp in milliseconds.
        # NOTE(review): the mask clears the low 23 bits before the 22-bit
        # shift, so the millisecond value is always even. Harmless here since
        # the caller truncates to whole seconds.
        epoch_ms = (activity_id & 0xffffffffff800000) >> 22
        # ``datetime.utcfromtimestamp`` is deprecated since Python 3.12; build
        # an aware UTC value and drop tzinfo to keep the naive-UTC result.
        aware = datetime.fromtimestamp(epoch_ms / 1000, tz=timezone.utc)
        return aware.replace(tzinfo=None)

    # entityUrn embeds the id as "...activity:<digits>,..." (possibly followed by more fields).
    entity_id = int(raw_post['entityUrn'].split("activity:")[1].split(',')[0])

    actions = raw_post['updateMetadata']['updateActions']['actions']
    share_action = next(
        (action for action in actions if action['actionType'] == "SHARE_VIA"),
        None,
    )
    if share_action is None:
        raise ValueError(f"post {entity_id} has no SHARE_VIA action to take its URL from")
    original_url = share_action['url']
    # Joining the URL with its own path strips the query string and fragment.
    original_url = urljoin(original_url, urlparse(original_url).path)

    created_at = datetime_from_id(entity_id)

    global_count += 1
    out_post = Post(
        id=entity_id,
        # Placeholder title, meant to be replaced by hand; the counter keeps slugs unique.
        title=f"CHANGE_ME {global_count}",
        content=raw_post['commentary']['text']['text'],
        created_at=created_at.replace(microsecond=0),
        original_url=original_url,
    )

    # TODO: raw_post['content'] may contain other components, e.g. the
    # com.linkedin.voyager.feed.render.ArticleComponent (ARTICLE_COMPONENT),
    # which indicates a linked article attached to the post. It is not used yet.

    return out_post

import yaml
# Render every post as a markdown file with YAML front matter, stored under
# <MD_ORGANIZED_CONTENT_BASE>/<year>/<zero-padded month>/<slug>.md.
for post in posts:
    pp = parse_post(post)

    yaml_header = yaml.dump({
        'title': pp.title,
        'date': pp.created_at.isoformat(),
        'li-id': pp.id,
        'li-url': pp.original_url,
    })
    dir_container = f"{MD_ORGANIZED_CONTENT_BASE}/{pp.created_at.year}/{pp.created_at.month:02}"
    # exist_ok replaces the separate isdir() check and its check-then-create gap.
    os.makedirs(dir_container, exist_ok=True)

    # Post text may contain non-ASCII characters (accents, emoji); write UTF-8
    # explicitly so the result does not depend on the platform's locale encoding.
    with open(f"{dir_container}/{pp.slug}.md", 'w', encoding="utf-8") as f:
        f.write("\n".join([
            "---",
            yaml_header.strip(),
            "---",
            "",
            pp.content,
        ]))