linkedin-posts-scraper/gen_content.py

113 lines
3.5 KiB
Python
Raw Normal View History

2023-02-19 11:10:12 +00:00
# Generate markdown that can be used with Hugo.
# A single JSON file that contains all the posts that we want to generate.
# This file contains a JSON array with LinkedIn user profile posts.
INP_FILE_PATH = "./out_posts_all_1.json"
# This file holds the current state of scraping: which articles have been scraped.
STORAGE_FILE_PATH = "./posts_scrapping_state.json"
# Output trees for the generated markdown: one organized by year/month, one flat.
MD_ORGANIZED_CONTENT_BASE = "./janco_website/true_content/posts"
MD_FLAT_CONTENT_BASE = "./janco_website/content/posts"
import json
from urllib.parse import urljoin, urlparse
import uuid
import os
from typing import Optional
def slurp_file_and_parse(inp_file_path):
    """Read the JSON file at *inp_file_path* and return the parsed value.

    The scraper dumps all posts as a single JSON array, so the result is
    normally a list of raw post dicts.
    """
    # json.load streams straight from the file object — no need to slurp
    # the whole file into a string first.
    with open(inp_file_path, encoding="utf-8") as inp_file:
        return json.load(inp_file)
posts = slurp_file_and_parse(INP_FILE_PATH)
from pprint import pprint
from slugify import slugify
from dataclasses import dataclass
from datetime import datetime
@dataclass
class Post:
    """A normalized LinkedIn post, ready to be rendered as a Hugo page."""

    id: int               # numeric LinkedIn activity id
    title: str            # human-readable title (placeholder until edited)
    created_at: datetime  # publication time derived from the activity id
    content: str          # plain-text body of the post
    original_url: str     # canonical LinkedIn URL of the post

    @property
    def slug(self) -> str:
        """URL-safe file-name stem derived from the title."""
        return slugify(self.title)
# Fully-qualified render-component key that marks a post with an attached article.
ARTICLE_COMPONENT = "com.linkedin.voyager.feed.render.ArticleComponent"
# Running counter used to generate unique placeholder titles in parse_post.
global_count = 0
def parse_post(raw_post):
    """Convert one raw LinkedIn feed entry (a dict) into a :class:`Post`.

    Side effect: increments the module-level ``global_count`` to produce a
    unique placeholder title for each post.

    Raises ``StopIteration`` if the entry has no SHARE_VIA action and
    ``KeyError`` if the expected nested keys are missing.
    """
    global global_count

    def datetime_from_id(activity_id: int) -> datetime:
        # The top 41 bits of a LinkedIn activity id encode the creation time
        # in milliseconds since the Unix epoch.
        # thanks to https://github.com/Ollie-Boyd/Linkedin-post-timestamp-extractor
        epoch_ms = (activity_id & 0xffffffffff800000) >> 22
        return datetime.utcfromtimestamp(epoch_ms / 1000)

    # "urn:li:activity:123..." -> 123...
    entity_id = int(raw_post['entityUrn'].split("activity:")[1].split(',')[0])

    # The SHARE_VIA action carries the canonical public URL of the post;
    # rebuilding from the path alone strips any query string / fragment.
    original_url = next(filter(
        lambda action: action['actionType'] == "SHARE_VIA",
        raw_post['updateMetadata']['updateActions']['actions'],
    ))['url']
    original_url = urljoin(original_url, urlparse(original_url).path)

    created_at = datetime_from_id(entity_id)
    global_count += 1

    # NOTE: a post may carry an attached article under
    # raw_post['content'][ARTICLE_COMPONENT]; extracting its image and
    # metadata is not implemented yet (TODO).
    return Post(
        id=entity_id,
        title=f"CHANGE_ME {global_count}",  # placeholder, edited by hand later
        content=raw_post['commentary']['text']['text'],
        created_at=created_at.replace(microsecond=0),
        original_url=original_url,
    )
import yaml

# Render every scraped post as a Hugo markdown page, organized by year/month.
for post in posts:
    pp = parse_post(post)

    # Hugo front matter for the generated markdown page.
    yaml_header = yaml.dump({
        'title': pp.title,
        'date': pp.created_at.isoformat(),
        'li-id': pp.id,
        'li-url': pp.original_url,
    })

    # <base>/<year>/<zero-padded month>/<slug>.md
    dir_container = f"{MD_ORGANIZED_CONTENT_BASE}/{pp.created_at.year}/{pp.created_at.month:02}"
    # exist_ok avoids the isdir/makedirs race and is a no-op when present.
    os.makedirs(dir_container, exist_ok=True)

    with open(f"{dir_container}/{pp.slug}.md", 'w', encoding="utf-8") as f:
        f.write("\n".join([
            "---",
            yaml_header.strip(),
            "---",
            "",
            pp.content,
        ]))