Last active 2 hours ago

kat revised this gist 2 hours ago.

1 file changed, 3 insertions, 2 deletions

lj_albums.py

@@ -60,6 +60,7 @@ with open("list.txt", "r") as file:
60 60 # find each photo description
61 61 descriptions = i.find_all_next("p", {"class" : lambda L: L and L.startswith("Description")})
62 62 # throw it all into a csv file
63 - for src, txt in zip(img_srcs_all, descriptions):
63 + for src, desc in zip(img_srcs_all, descriptions):
64 64 s = src.replace("600", "original")
65 - writer.writerow([s, txt.text])
65 + t = desc.text
66 + writer.writerow([s, t])
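
This revision renames the loop variable and pulls the caption text into a local before the row is written. For context, here is a sketch of what that inner step amounts to, as a hypothetical write_rows helper (not in the gist); the replace("600", "original") swap assumes the only "600" in an image URL is its size segment, and get_text(strip=True) is used here to trim stray whitespace around the caption:

def write_rows(writer, img_srcs, descriptions):
    # pair each scaled image URL with the caption tag that follows it
    for src, desc in zip(img_srcs, descriptions):
        # swap the size segment for the full-resolution variant; assumes
        # "600" only appears in the URL as the size segment
        full_res = src.replace("600", "original")
        # get_text(strip=True) trims whitespace around the caption text
        writer.writerow([full_res, desc.get_text(strip=True)])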

kat revised this gist 3 hours ago.

1 file changed, 50 insertions, 28 deletions

lj_albums.py

@@ -12,32 +12,54 @@ from bs4 import BeautifulSoup
12 12 from selenium import webdriver
13 13 from selenium.webdriver import Chrome
14 14 from selenium.webdriver.chrome.options import Options
15 + import urllib.parse
15 16
16 - url = "https://dani-ellie.livejournal.com/photo/album/2422/"
17 - csv_file = "album.csv"
18 -
19 - options = webdriver.ChromeOptions()
20 - options.add_argument('--headless')
21 - driver = webdriver.Chrome(options=options)
22 -
23 - driver.get(url)
24 - page_body = driver.page_source
25 -
26 - soup = BeautifulSoup(page_body, "html.parser")
27 -
28 - containers = soup.find_all("a", {"class" : lambda L: L and L.startswith("Container")})
29 -
30 - with open(csv_file, mode="w", newline="") as f:
31 - writer = csv.writer(f)
32 - header = ["source", "description"]
33 - writer.writerow(header)
34 - for body in containers:
35 - img_divs = body.find_all("div", {"class" : lambda L: L and L.startswith("ImgWrapper")})
36 - for t in img_divs:
37 - img_tags = t.find_all("img")
38 - img_srcs_all = [img["src"] for img in img_tags]
39 - for i in img_tags:
40 - descriptions = i.find_all_next("p", {"class" : lambda L: L and L.startswith("Description")})
41 - for src, txt in zip(img_srcs_all, descriptions):
42 - s = src.replace("600", "original")
43 - writer.writerow([s, txt.text])
17 + # read newline separated list of urls
18 + with open("list.txt", "r") as file:
19 + for l in file:
20 + url = l
21 +
22 + # break down url to get username & album id
23 + # this is for csv file naming
24 + parsed_url = urllib.parse.urlparse(url)
25 + url_path = parsed_url.path
26 + album_user = parsed_url.hostname.split(".")[0]
27 + album_id = url_path.split("/")[3] # get id from album url
28 +
29 + # define csv file name
30 + csv_file = album_user + "_" + album_id + ".csv"
31 +
32 + # selenium scraping stuff
33 + options = webdriver.ChromeOptions()
34 + options.add_argument('--headless')
35 + driver = webdriver.Chrome(options=options)
36 + driver.get(url)
37 + page_body = driver.page_source
38 +
39 + # start parsing
40 + soup = BeautifulSoup(page_body, "html.parser")
41 +
42 + # get main container for photo album
43 + containers = soup.find_all("a", {"class" : lambda L: L and L.startswith("Container")})
44 +
45 + # create & open csv file
46 + with open(csv_file, mode="w", newline="") as f:
47 + writer = csv.writer(f)
48 + # write headers to csv file
49 + header = ["source", "description"]
50 + writer.writerow(header)
51 + for body in containers:
52 + # find all img wrappers/divs
53 + img_divs = body.find_all("div", {"class" : lambda L: L and L.startswith("ImgWrapper")})
54 + for t in img_divs:
55 + # find all img tags
56 + img_tags = t.find_all("img")
57 + # find all img src attributes
58 + img_srcs_all = [img["src"] for img in img_tags]
59 + for i in img_tags:
60 + # find each photo description
61 + descriptions = i.find_all_next("p", {"class" : lambda L: L and L.startswith("Description")})
62 + # throw it all into a csv file
63 + for src, txt in zip(img_srcs_all, descriptions):
64 + s = src.replace("600", "original")
65 + writer.writerow([s, txt.text])
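
The bulk of this revision turns the single hard-coded album into a loop over list.txt and derives each CSV name from the album URL. A sketch of that naming step, as a hypothetical csv_name_for helper (not in the gist); note the .strip() here, since lines read from list.txt keep their trailing newline, which the revision passes straight into urlparse:

import urllib.parse

def csv_name_for(album_url):
    # drop the trailing newline left by iterating over list.txt
    parsed = urllib.parse.urlparse(album_url.strip())
    # hostname "dani-ellie.livejournal.com" -> username "dani-ellie"
    user = parsed.hostname.split(".")[0]
    # path "/photo/album/2422/" -> album id "2422"
    album_id = parsed.path.split("/")[3]
    return user + "_" + album_id + ".csv"

# e.g. csv_name_for("https://dani-ellie.livejournal.com/photo/album/2422/\n")
# returns "dani-ellie_2422.csv"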

kat revised this gist 5 hours ago.

1 file changed, 43 insertions

lj_albums.py (file created)

@@ -0,0 +1,43 @@
1 + #!/usr/bin/env -S uv run --script
2 + # /// script
3 + # requires-python = ">=3.14"
4 + # dependencies = [
5 + # "bs4>=0.0.2",
6 + # "selenium>=4.44.0",
7 + # ]
8 + # ///
9 +
10 + import csv
11 + from bs4 import BeautifulSoup
12 + from selenium import webdriver
13 + from selenium.webdriver import Chrome
14 + from selenium.webdriver.chrome.options import Options
15 +
16 + url = "https://dani-ellie.livejournal.com/photo/album/2422/"
17 + csv_file = "album.csv"
18 +
19 + options = webdriver.ChromeOptions()
20 + options.add_argument('--headless')
21 + driver = webdriver.Chrome(options=options)
22 +
23 + driver.get(url)
24 + page_body = driver.page_source
25 +
26 + soup = BeautifulSoup(page_body, "html.parser")
27 +
28 + containers = soup.find_all("a", {"class" : lambda L: L and L.startswith("Container")})
29 +
30 + with open(csv_file, mode="w", newline="") as f:
31 + writer = csv.writer(f)
32 + header = ["source", "description"]
33 + writer.writerow(header)
34 + for body in containers:
35 + img_divs = body.find_all("div", {"class" : lambda L: L and L.startswith("ImgWrapper")})
36 + for t in img_divs:
37 + img_tags = t.find_all("img")
38 + img_srcs_all = [img["src"] for img in img_tags]
39 + for i in img_tags:
40 + descriptions = i.find_all_next("p", {"class" : lambda L: L and L.startswith("Description")})
41 + for src, txt in zip(img_srcs_all, descriptions):
42 + s = src.replace("600", "original")
43 + writer.writerow([s, txt.text])
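
The initial version fetches one hard-coded album with headless Chrome and hands the rendered page source to BeautifulSoup, presumably because the album grid is built client-side. A minimal sketch of that fetch step, wrapped in a hypothetical fetch_album_soup helper (not in the gist) that also quits the driver so repeated runs don't leave Chrome processes behind:

from bs4 import BeautifulSoup
from selenium import webdriver

def fetch_album_soup(album_url):
    # headless Chrome, matching the options used in the gist
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(album_url)
        # page_source is read after the browser has run the page's scripts
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # close the browser once the page has been captured
        driver.quit()

The prefix-based class lookups that follow in the script (lambda L: L and L.startswith("Container"), and likewise for ImgWrapper and Description) appear to account for LiveJournal's generated class-name suffixes.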