-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMaldonadoSebastian_086_P3C.py
206 lines (176 loc) · 8.75 KB
/
MaldonadoSebastian_086_P3C.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Filename: MaldonadoSebastian_086_P3.py
### ADD YOUR NAME, STUDENT ID AND SECTION NUMBER BELOW ###
# NAME: Sebastian G. Maldonado Rosado
# STUDENT ID: ###-##-####
# SECTION: 086
"""Parse the contents of an HTML file and output the internal resources used.
We are looking for tags of interest: a, script, link, img, audio, video, and form.
Within each tag of interest we're looking for a particular attribute of
interest (href for a & link, src for script, img, audio & video, action for form).
Each tag of interest is to be represented by a dictionary, where the attribute names
will be the dictionary keys and the attribute values will be the dictionary values.
A list is created for each type of tag, storing all of the internal
resources referenced by tags of that type.
Finally, the results are stored in an output file.
Input: The file index.html will be used as an input file
Output: The results will be stored in a file named index_resources.txt
"""
# CONSTANTS
# Name of the HTML file that is read and of the text file the results go to.
INPUTFILE = 'index.html'
OUTPUTFILE = 'index_resources.txt'
# Maps every tag of interest (key) to its attribute of interest (value), so
# the rest of the program can be driven by this table instead of hard-coding
# specific tag or attribute names.
# NOTE: the key order matters, because the tags are searched for in this order.
DICTOFINTEREST = {
    'a': 'href',
    'link': 'href',
    'form': 'action',
    'img': 'src',
    'script': 'src',
    'audio': 'src',
    'video': 'src',
}
def load_data():
    """Return the contents of the input file as a list of lines.

    Returns:
    A list with every line of INPUTFILE (line endings included), or None
    if the file could not be opened.
    """
    try:
        fh = open(INPUTFILE)
    except OSError:
        # Only failures to open the file are reported as None; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    # The with-statement closes the file even if reading it fails midway.
    with fh:
        return fh.readlines()
def get_tag_of_interest(line):
    """Return a tag of interest if one is found in the line, or None otherwise.

    At most one tag is returned per line. The tags are searched for in the
    order of the keys of DICTOFINTEREST, not in the order in which they
    appear in the line.

    Parameters:
    line - A single line of text from the HTML file being processed.

    Returns:
    A string with the complete (opening) tag of interest, from its '<' up to
    and including its closing '>', if one is found, or None otherwise.
    """
    # The tags of interest are the keys to the dictionary DICTOFINTEREST.
    for tagName in DICTOFINTEREST:
        # Note that, for a tag to have a resource, it must have a space after the tag name
        openingTag = '<' + tagName + ' '
        posTagBegin = line.find(openingTag)
        if posTagBegin == -1:
            continue  # This tag is not in the line; try the next one.
        # Make sure we don't just find any '>', but the next one after the start of the tag.
        posTagEnd = line.find('>', posTagBegin)
        if posTagEnd == -1:
            # The tag is not closed on this line, so there is no complete tag
            # to return. Without this check the slice below would produce an
            # empty string instead of a tag.
            continue
        return line[posTagBegin:posTagEnd + 1]
    # If we're still in the function, then we didn't find any tags of interest.
    return None
def get_attr_of_interest(tag):
    """Return value of attribute of interest if one is in the tag, or None otherwise.

    Parameters:
    tag - A tag (as a dict) within which we'll look for the attribute of interest.
          Attribute names are the dict keys and attribute values are the dict values.
          The tag name can be found as the value of the 'tagName' key.

    Returns:
    A string representing the value of the attribute of interest for the tag received,
    or None if the tag is not of interest, the attribute is absent or has no
    value (boolean attribute), or the resource is external.
    """
    # Look up which attribute matters for this type of tag (None if the tag
    # is not a tag of interest).
    attr = DICTOFINTEREST.get(tag.get('tagName'))
    if attr is None:
        return None
    value = tag.get(attr)
    # A missing attribute yields None and a boolean attribute (one written
    # without a value) is stored as True; neither names a resource.
    if not isinstance(value, str):
        return None
    # External resources are not wanted.
    if value.startswith(('http:', 'https:')):
        return None
    return value
def write_results(dictOfResources):
    """Write all of the resources to an output file.

    Each tag that has at least one resource gets a section: a line with the
    tag name followed by one tab-indented line per resource. Sections are
    written in alphabetical order of the tag name; tags with no resources
    are skipped.

    Parameters:
    dictOfResources - Dictionary of resources to be saved in the output file.
                      The keys are the tags of interest and each value is a
                      list of all of the resources for that type of tag.
    """
    # The with-statement guarantees the file is closed, even if a write fails.
    with open(OUTPUTFILE, 'w') as outFH:
        for tag, resources in sorted(dictOfResources.items()):
            if not resources:
                continue  # Nothing was found for this tag; no section needed.
            outFH.write(tag + '\n')  # Writes the title of a section.
            for element in resources:
                outFH.write('\t' + str(element) + '\n')
def tag_as_dict(openingTag):
    """Convert an opening HTML tag into a dictionary.

    The attribute names will be the keys of the dictionary and the attribute values
    will be the values of those keys. In the case of boolean attributes (the ones
    that don't have a value assigned to them), the value will be set to True.
    The dictionary will also have the special key 'tagName' to store the tag name
    (e.g. img, audio).

    Attribute values may be wrapped in double or single quotes, may contain
    '=' characters (e.g. URLs with a query string), and the tag may be
    self-closing (ending in '/>').

    NOTE: We assume attribute values DO NOT have spaces, and that the only spaces
    in the tag are to separate attributes.

    Parameters:
    openingTag - The opening HTML tag to be converted into a dictionary.

    Returns:
    A dictionary representation of the tag, as detailed above. If the tag has
    no content at all, the dictionary only holds an empty 'tagName'.
    """
    # Remove the angle brackets so that only the name and attributes remain.
    body = openingTag.strip().lstrip('<').rstrip('>')
    tokens = body.split()
    # A self-closing tag written as '<img ... />' leaves a lone '/' at the end.
    if tokens and tokens[-1] == '/':
        tokens.pop()
    if not tokens:
        return {'tagName': ''}

    # The first token is the tag name ('br/' can come from '<br/>').
    dicc = {'tagName': tokens[0].rstrip('/')}
    for token in tokens[1:]:
        # Split only on the FIRST '=' so values that contain '=' stay whole.
        attr, equalsSign, rawValue = token.partition('=')
        if not equalsSign:
            # Boolean attribute; drop a self-closing slash glued to its name.
            attr = attr.rstrip('/')
            if attr:
                dicc[attr] = True
            continue
        # Drop a self-closing slash glued to the closing quote ('"a.png"/').
        if rawValue.endswith(('"/', "'/")):
            rawValue = rawValue[:-1]
        dicc[attr] = rawValue.strip('"\'')
    return dicc
def main():
    """Find the internal resources used by INPUTFILE and save them to OUTPUTFILE.

    Reads the input file line by line, extracts the value of the attribute of
    interest from every tag of interest found, groups the values by tag name,
    sorts each group alphabetically and writes the result to the output file.
    If the input file cannot be opened, an error message is printed and the
    program ends with a non-zero exit status.
    """
    lstOfLines = load_data()
    if lstOfLines is None:
        print('ERROR: Could not open {}!'.format(INPUTFILE))
        # Raise SystemExit directly: the exit() helper is only meant for the
        # interactive interpreter and is not guaranteed to be available.
        raise SystemExit(1)

    # The following dictionary will store all of the resources found, using the
    # tag name as the keys and a list of resources as the value. Every tag of
    # interest starts out with an empty list.
    resourcesDict = {tagName: [] for tagName in DICTOFINTEREST}

    # Loops through the lines of a file, looking for tags and attributes of interest within them.
    for line in lstOfLines:
        rawTag = get_tag_of_interest(line)
        if not rawTag:
            continue  # No (complete) tag of interest in this line.
        tagDict = tag_as_dict(rawTag)
        attrVal = get_attr_of_interest(tagDict)
        if attrVal is not None:
            # A value is only returned for tags of interest, so the tag name
            # is guaranteed to be one of the keys of resourcesDict.
            resourcesDict[tagDict['tagName']].append(attrVal)

    # Orders the resources of every tag by alphabetical order.
    for resources in resourcesDict.values():
        resources.sort()

    # Saves resourcesDict in an output file.
    write_results(resourcesDict)
# Run the program only when this file is executed as a script; when it is
# imported as a module nothing is run automatically.
if __name__ == '__main__':
    main()