-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscraper.py
More file actions
104 lines (88 loc) · 2.89 KB
/
scraper.py
File metadata and controls
104 lines (88 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from collections import defaultdict
import requests, json
from bs4 import BeautifulSoup
from dataclasses import dataclass, field, InitVar, asdict
from typing import List
@dataclass
class Meeting:
    """One meeting entry scraped from a committee's meeting listing."""

    # Text of the link that names the meeting.
    title: str
    # Absolute URL of the meeting's detail page; empty until it is filled in.
    detail_link: str = ""
    # Webcast URL; nothing in this file populates it yet.
    webcast_link: str = ""
@dataclass
class CommMember:
    """A person listed as a member of a committee."""

    # Member name as it appears in the committee's member list.
    name: str
    # Absolute URL of the member's own page; empty when the entry has no link.
    page_link: str = ""
    # URL of a portrait image; nothing in this file populates it yet.
    image_link: str = ""
@dataclass
class Committee:
    """A council committee together with its scraped members and meetings."""

    # Committee name, taken from the link text on the committee index page.
    name: str
    # Category looked up from the committee/category JSON mapping.
    category: str = ""
    # Absolute URL of the committee's own page.
    link: str = ""
    # Free-text description; nothing in this file populates it yet.
    council_description: str = ""
    # Each instance gets its own fresh list (default_factory), never a shared one.
    members: List[CommMember] = field(default_factory=list)
    meetings: List[Meeting] = field(default_factory=list)
# Load the committee/category mapping. Every entry is read through its
# "Category" and "Committee" keys below. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("public/council_committee_JSON.json", encoding="utf-8") as f:
    committee_list = json.load(f)

# Index the mapping both ways:
#   cat_comm: category  -> list of committee names in that category
#   comm_cat: committee -> its category
cat_comm = defaultdict(list)
comm_cat = {}
for entry in committee_list:
    cat_comm[entry["Category"]].append(entry["Committee"])
    comm_cat[entry["Committee"]] = entry["Category"]
def soupify(link, timeout=30):
    """Download *link* and parse the response body as HTML.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(link, timeout=timeout)
    return BeautifulSoup(response.content, "html.parser")
def get_members(link, domain):
    """Scrape the member list from a committee page.

    The members are read from the first ``<ul>`` that follows the
    ``<a name="#MEM">`` anchor on the page.

    Args:
        link: URL of the committee page.
        domain: Prefix prepended to each member's ``href`` to build
            ``page_link``.

    Returns:
        A list of ``CommMember`` objects, one per ``<li>``. The list is empty
        when the page has no member anchor or no list after it. Entries
        without a link (or without an ``href``) keep an empty ``page_link``.
    """
    members = []
    page = soupify(link)

    anchor = page.find("a", attrs={"name": "#MEM"})
    # find_next() returns None when no <ul> follows the anchor; treat that the
    # same as a page with no member section instead of crashing.
    member_list = anchor.find_next("ul") if anchor is not None else None
    if member_list is None:
        return members

    for item in member_list.find_all("li"):
        profile = item.find("a")
        if profile is None:
            # Member listed as plain text, without a profile link.
            members.append(CommMember(item.text))
            continue
        member = CommMember(profile.text)
        href = profile.get("href")
        if href is not None:
            member.page_link = domain + href
        members.append(member)
    return members
def get_meeting_link(s):
    """Return the href of the "browse meetings" link on a committee page.

    Args:
        s: Parsed page; it is only used through ``find_all("a")``.

    Returns:
        The ``href`` of the first anchor whose text contains the browse-meetings
        phrase, or ``None`` when no anchor matches.
    """
    marker = "Browse meetings and agendas for this committee"
    matching_hrefs = (
        anchor.get("href") for anchor in s.find_all("a") if marker in anchor.text
    )
    return next(matching_hrefs, None)
def get_meetings(link, domain):
    """Scrape the meetings listed on a committee's meetings page.

    The meetings are read from the first ``<ul>`` that follows the
    ``<div class="mgTableTitleTxt">`` heading.

    Args:
        link: URL of the committee's meeting listing.
        domain: Prefix prepended to each meeting's ``href`` to build
            ``detail_link``.

    Returns:
        A list of ``Meeting`` objects, one per ``<li>``. The list is empty when
        the heading or the list after it is missing. Entries without a link
        (or without an ``href``) keep an empty ``detail_link``.
    """
    meetings = []
    page = soupify(link)

    heading = page.find("div", {"class": "mgTableTitleTxt"})
    # find_next() returns None when no <ul> follows the heading.
    meeting_list = heading.find_next("ul") if heading is not None else None
    if meeting_list is None:
        return meetings

    for item in meeting_list.find_all("li"):
        anchor = item.find("a")
        if anchor is None:
            # Mirror get_members: keep link-less entries, titled by their text,
            # instead of crashing on a missing anchor.
            meetings.append(Meeting(item.text))
            continue
        meeting = Meeting(anchor.text)
        href = anchor.get("href")
        if href is not None:
            meeting.detail_link = domain + href
        meetings.append(meeting)
    return meetings
# Index page that lists every committee, and the site root that the hrefs
# found on the pages are appended to.
start = "https://committees.aberdeencity.gov.uk/mgListCommittees.aspx?bcr=1"
domain = "https://committees.aberdeencity.gov.uk/"

comm_soup = soupify(start)
content_div = comm_soup.find("div", attrs={"class": "mgContent"})

committees = []
for comm in content_div.find_all("li"):
    a = comm.find("a")
    href = a.get("href") if a is not None else None
    if href is None:
        # A list item without a usable link cannot be followed; skip it.
        continue
    c = Committee(name=a.text)
    if c.name == "Council":
        # TODO: decide how we want to handle full council
        continue
    c.link = domain + href
    # Committees missing from the category mapping are filed under "Other".
    c.category = comm_cat.get(c.name, "Other")
    c.members = get_members(c.link, domain)
    # get_meeting_link() returns None when the page has no "browse meetings"
    # link; leave the meeting list empty in that case instead of crashing.
    meeting_href = get_meeting_link(soupify(c.link))
    if meeting_href is not None:
        c.meetings = get_meetings(domain + meeting_href, domain)
    committees.append(c)
    # One progress dot per committee scraped.
    print(".", end="", flush=True)
print("")

# Serialise every scraped committee (with nested members and meetings).
cmt_json = json.dumps([asdict(c) for c in committees], indent=4)
with open("committees.json", "w", encoding="utf-8") as f:
    f.write(cmt_json)