linkedin-posts-scraper/scrape.py

import json
import os

from linkedin_api import Linkedin


def fetch_posts(client, public_id=None, urn_id=None, post_count=10):
"""
get_profile_posts: Get profile posts
:param public_id: LinkedIn public ID for a profile
:type public_id: str, optional
:param urn_id: LinkedIn URN ID for a profile
:type urn_id: str, optional
:param post_count: Number of posts to fetch
:type post_count: int, optional
:return: List of posts
:rtype: list
"""
    url_params = {
        "count": min(post_count, client._MAX_POST_COUNT),
        "start": 0,
        "q": "memberShareFeed",
        "moduleKey": "member-shares:phone",
        "includeLongTermHistory": True,
    }
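    # Build the profile URN: use urn_id directly if given, otherwise look the
    # profile up by public_id and rewrite its "fs_miniProfile" URN into the
    # "fsd_profile" form this endpoint expects.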
    if urn_id:
        profile_urn = f"urn:li:fsd_profile:{urn_id}"
    else:
        profile = client.get_profile(public_id=public_id)
        profile_urn = profile["profile_urn"].replace(
            "fs_miniProfile", "fsd_profile"
        )
    url_params["profileUrn"] = profile_urn
    url = "/identity/profileUpdatesV2"
    res = client._fetch(url, params=url_params)
    data = res.json()
    # A non-200 "status" field in the body signals an API-level error.
    if data and "status" in data and data["status"] != 200:
        client.logger.info("request failed: {}".format(data["message"]))
        return []
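    # Page through the feed: while the response carries a non-empty
    # paginationToken, request the next chunk and append its elements,
    # stopping once post_count posts have been collected.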
    while data and data["metadata"]["paginationToken"] != "":
        print(f"got {len(data['elements'])=}")
        if len(data["elements"]) >= post_count:
            break
        pagination_token = data["metadata"]["paginationToken"]
        url_params["start"] = url_params["start"] + client._MAX_POST_COUNT
        url_params["paginationToken"] = pagination_token
        print("new request", url_params)
        res = client._fetch(url, params=url_params)
        page = res.json()
        data["metadata"] = page["metadata"]
        data["elements"] = data["elements"] + page["elements"]
        data["paging"] = page["paging"]
    return data["elements"]


# Authenticate using any LinkedIn account credentials
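# Credentials are read from the LINKEDIN_USERNAME / LINKEDIN_PASSWORD
# environment variables.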
api = Linkedin(os.getenv("LINKEDIN_USERNAME"), os.getenv("LINKEDIN_PASSWORD"))
# Example: look up a profile to obtain its URN ID
# profile = api.get_profile('jean-marc-jancovici')
# profile_id = "ACoAAAYJ-P4B8XcfHOiVTdmiAyYGfvxBRs3J_Ug"
urn_id = "ACoAAAYJ-P4B8XcfHOiVTdmiAyYGfvxBRs3J_Ug"
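# Fetch up to 800 posts; fetch_posts pages through the feed in chunks of the
# client's _MAX_POST_COUNT until that many posts are collected or the feed ends.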
posts = fetch_posts(api, urn_id=urn_id, post_count=800)
# Save the fetched posts to a JSON file.
with open("./out_posts_all_1.json", "w") as f:
    f.write(json.dumps(posts, indent=4))