web_scraping.py
#!/usr/bin/env python3
"""
Script to scrape relevant DICOM Standard information from HTML tables and store them locally as JSON files
Author: Jason Lunn, The Institute of Cancer Research, UK
"""
import json
from bs4 import BeautifulSoup
import requests


def standard_sopclassuids():
    """Scrape the Standard SOP Classes table (PS3.4 2013, Section B.5) and save name/UID pairs as JSON."""
    url = "http://dicom.nema.org/dicom/2013/output/chtml/part04/sect_B.5.html"
    html = requests.get(url).content
    # Specify the parser explicitly (lxml, as used in confidentiality_profiles below).
    soup = BeautifulSoup(html, features="lxml")

    # The SOP Class table is the first <tbody> on the page.
    table = soup.find_all('tbody')[0]
    data_dict = {}
    for row in table.find_all('tr'):
        name = row.find_all('td')[0].text
        uid = row.find_all('td')[1].text
        data = (name.strip('\n'), uid.strip('\n'))
        # Key each (name, UID) pair by its string form; the value is a selection flag.
        data_dict[f"{data}"] = False

    with open("/Users/jlunn/Desktop/sopclassuids.json", 'w') as json_file:
        json.dump(data_dict, json_file, indent=4)


def confidentiality_profiles():
    """Scrape DICOM PS3.15 Table E.1-1 (confidentiality profile attributes) and save it as JSON."""
    url = "https://dicom.nema.org/medical/dicom/current/output/html/part15.html#sect_E.1"
    html = requests.get(url).content
    soup = BeautifulSoup(html, features="lxml")

    # Locate Table E.1-1 via its anchor, then walk up to the enclosing element to reach the table body.
    table_link = soup.find('a', id='table_E.1-1')
    parent_div = table_link.parent
    table_body = parent_div.find('tbody')

    attribute_dict = {}
    # Column index -> JSON field name, matching the column order of Table E.1-1.
    table_headers = {0: 'attribute_name',
                     1: 'tag_number',
                     2: 'retired',
                     3: 'in_standard_composite_iod',
                     4: 'basic_profile',
                     5: 'retain_private',
                     6: 'retain_uids',
                     7: 'retain_device_id',
                     8: 'retain_institution_id',
                     9: 'retain_patient_characteristics',
                     10: 'retain_full_dates',
                     11: 'retain_modified_dates',
                     12: 'clean_descriptions',
                     13: 'clean_structured_content',
                     14: 'clean_graphical'}

    for row in table_body.find_all('tr'):
        row_dict = {}
        table_cells = row.find_all('td')
        for index, label in table_headers.items():
            data = table_cells[index].text
            row_dict[label] = data.strip('\n')
            if index == 1:
                # Column 1 holds the attribute tag, used as the top-level key.
                tag_number = data.strip('\n')
        attribute_dict[tag_number] = row_dict

    with open("/Users/jlunn/Desktop/dicom_profiles.json", 'w') as json_file:
        json.dump(attribute_dict, json_file, indent=4)
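

# Illustrative sketch (not part of the original script): one way the JSON produced by
# confidentiality_profiles() could be read back and queried. The example tag
# "(0010,0010)" (Patient's Name) and the default path are assumptions for demonstration only.
def example_lookup(tag="(0010,0010)", path="/Users/jlunn/Desktop/dicom_profiles.json"):
    with open(path) as json_file:
        profiles = json.load(json_file)
    # Return the per-profile flags recorded for the requested attribute tag, or None if absent.
    return profiles.get(tag)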


def main():
    # standard_sopclassuids()
    confidentiality_profiles()


if __name__ == '__main__':
    main()