-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkb_data.py
289 lines (260 loc) · 14.8 KB
/
kb_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#!/usr/bin/env python
""" kb_data.py: Provides resolution table from VMware KBs as machine-readable json files.
VMware KBs provide release information only as a human-readable HTML table.
However, for automation it would be nice to have it in a machine-readable format.
This script takes the tables from a VMware KB page and provides a json-file as an output.
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""
__author__ = "Dominik Zorgnotti"
__contact__ = "dominik@why-did-it.fail"
__created__ = "2020-02-26"
__deprecated__ = False
__contact__ = "dominik@why-did-it.fail"
__license__ = "GPLv3"
__status__ = "beta"
__version__ = "0.4.0"
import pandas as pd
from data_handling import standardize_columns
from webparsing import get_kb_webdata
# YOLO as I am okay with overwriting DF data regardless of the results
pd.options.mode.chained_assignment = None # default='warn'
class KbData:
def __init__(self, kb_id):
self.id = kb_id
self.raw_html_article = get_kb_webdata(kb_id)
self.title = self.get_title()
self.product = self.get_first_product_name()
self.fmt_product = self.format_product_name()
self.raw_html_resolution = self.get_resolution_section()
self.list_of_dframes = self.parse_releasedata()
self.list_of_merged_frames = None
def get_resolution_section(self):
"""Extracts the resolution section from the KB article content section"""
# Get the Section within the webpage that holds the desired data (make it a bit more targeted)
if "Resolution" in self.raw_html_article["content"][1]:
resolution = self.raw_html_article["content"][1]["Resolution"]
elif "Resolution" in self.raw_html_article["content"][0]:
resolution = self.raw_html_article["content"][0]["Resolution"]
else:
raise ValueError("No resolution section in this page!")
return resolution
def get_first_product_name(self):
"""Extracts the first product mentioned in the KB articles meta data"""
# Get the product, quick win - just pick the first in the list
return self.raw_html_article["meta"]["articleProducts"]["relatedProducts"][0]
def format_product_name(self):
"""Makes a lower-case-version without spaces from the product name and returns it as a string"""
return self.product.lower().strip().replace(" ", "_")
def get_title(self):
"""Extracts the first product mentioned in the KB articles meta data"""
# Get the product, quick win - just pick the first in the list
return self.raw_html_article["meta"]["articleInfo"]["title"]
def parse_releasedata(self):
"""Accepts the html data for product releases from the KB article for parsing with pandas."""
df = pd.read_html(self.raw_html_resolution, flavor="bs4")
# Contains a list of all tables converted to dataframes in the resolution section
list_of_release_df = []
dict_of_releases = {}
for table_id in range(len(df)):
# Since some HTML table have no header, we need to reassign the first row as heading
if "Version" not in df[table_id].columns:
df_header = df[table_id][:1]
current_df = df[table_id][1:]
current_df.columns = df_header.values.tolist()[0]
# Moving the del up here
del df_header
else:
current_df = df[table_id]
releaseinfo_dataframe = standardize_columns(current_df)
# Get the data types right, especially the date format='%m/%d/%Y'
if "Release Date" in current_df.columns:
releaseinfo_dataframe["Release Date"] = pd.to_datetime(current_df["Release Date"],
infer_datetime_format=True, errors='coerce')
list_of_release_df.append(releaseinfo_dataframe)
# Fun stuff may happen with dataframes if not erased before the next iteration
del current_df, releaseinfo_dataframe
return list_of_release_df
# You might read this and say "he's drunk!". Alas, it's pure desperation.
# vCenter releases
class Kb2143838(KbData):
def __init__(self, kb_id):
super().__init__(kb_id)
self.list_of_dframes = self.parse_releasedata()
self.list_of_merged_frames = self.merge_tables_kb2143838()
def parse_releasedata(self):
"""Accepts the html data for product releases from the KB article for parsing with pandas."""
df = pd.read_html(self.raw_html_resolution, flavor="bs4")
# Contains a list of all tables converted to dataframes in the resolution section
list_of_release_df = []
for table_id in range(len(df)):
# VCSA 7
if table_id == 0:
vcenter7_table = df[table_id]
reformatted_df = self.transform_kb2143838(vcenter7_table)
reformatted_df["Edition"] = "VCSA"
reformatted_df["Release Date"] = pd.to_datetime(reformatted_df["Release Date"],
infer_datetime_format=True,
errors='coerce')
list_of_release_df.append(reformatted_df)
# VCSA/Windows 6.7
elif table_id == 1:
vcenter67_table = df[table_id]
product_editions = ["VCSA", "Windows"]
for product_edition in product_editions:
split_df = self.split_kb2143838(vcenter67_table, product_edition)
reformatted_df = self.transform_kb2143838(split_df)
reformatted_df["Release Date"] = pd.to_datetime(reformatted_df["Release Date"],
infer_datetime_format=True,
errors='coerce')
list_of_release_df.append(reformatted_df)
del split_df
# VCSA/Windows less equal 6.5
elif table_id == 2:
# The HTML table have no header, we need to reassign the first row as heading
df_header = df[table_id][:1]
vcenter_le65_table = df[table_id][1:]
vcenter_le65_table.columns = df_header.values.tolist()[0]
# Get the data types right, especially the date format='%m/%d/%Y'
vcenter_le65_table["Release Date"] = pd.to_datetime(vcenter_le65_table["Release Date"],
infer_datetime_format=True,
errors='coerce')
# Filter VCSA releases by keyword "Appliance", for Windows negate the search
vcsa_le65 = vcenter_le65_table[vcenter_le65_table["Version"].str.contains("appliance", case=False)]
vcsa_le65["Edition"] = "VCSA"
winvc_le65 = vcenter_le65_table[~vcenter_le65_table["Version"].str.contains("appliance", case=False)]
winvc_le65["Edition"] = "Windows"
list_of_release_df.append(vcsa_le65)
list_of_release_df.append(winvc_le65)
else:
print("Unknown table added, please add handling")
return list_of_release_df
def split_kb2143838(self, dataframe, product_edition):
"""Splits a dataframe based on the product edition (VCSA, Windows) and returns the output dataframe"""
tempdf_headless = dataframe[dataframe[0] == product_edition]
tempdf_header = tempdf_headless[:1]
tempdf = tempdf_headless[1:]
tempdf.columns = tempdf_header.values.tolist()[0]
tempdf.rename(columns={product_edition: "Edition"}, inplace=True)
return tempdf
def transform_kb2143838(self, dataframe):
"""Special handling of KB2143838 (vCenter)"""
# When you access the vCenter API the values from this column are returned, alias it as Build Number
if "Client/MOB/vpxd.log" in dataframe.columns:
dataframe["Build Number"] = dataframe["Client/MOB/vpxd.log"]
if "Version" in dataframe.columns:
# Normalize unicode with none breaking space in some rows
dataframe["Version"] = dataframe["Version"].str.normalize("NFKD")
tempdf = dataframe.rename(columns={"Version": "Version - Release Name"})
tempdf[["Version", "Release Name"]] = tempdf["Version - Release Name"].str.split(pat=r"(", expand=True)
# Remove ) and trailing space
tempdf["Release Name"] = tempdf["Release Name"].str.strip(r")")
tempdf["Version"] = tempdf["Version"].str.strip()
return tempdf
def merge_tables_kb2143838(self):
"""Accepts a list of dataframes, merge them and return a list of the merged df"""
# Return this list when ready
merged_vcenter_tables = {}
# Prepare the tables
vc7x_vcsa = self.list_of_dframes[0]
vc67_vcsa = self.list_of_dframes[1]
vc67_win = self.list_of_dframes[2]
vc65le_vcsa = self.list_of_dframes[3]
vc65le_win = self.list_of_dframes[4]
# Solved by WET
# Merge VCSA tables
merged_vcsa_builds = vc7x_vcsa.append(vc67_vcsa)
merged_vcsa_builds = merged_vcsa_builds.append(vc65le_vcsa)
merged_vcsa_builds.reset_index(drop=True, inplace=True)
merged_vcenter_tables["vcsa_builds"] = merged_vcsa_builds
# Merge vCenter for Windows tables
merged_windows_builds = vc67_win.append(vc65le_win)
merged_windows_builds.reset_index(drop=True, inplace=True)
merged_vcenter_tables["windows_vc_builds"] = merged_windows_builds
# Merge both tables
merged_vc_all_builds = merged_vcsa_builds.append(merged_windows_builds)
merged_vc_all_builds.reset_index(drop=True, inplace=True)
merged_vcenter_tables["all_vcenter_builds"] = merged_vc_all_builds
# Return the list
return merged_vcenter_tables
# vRA releases
class Kb2143850(KbData):
def __init__(self, kb_id):
super().__init__(kb_id)
self.list_of_dframes = self.parse_releasedata()
def parse_releasedata(self):
"""Accepts the html data for product releases from the KB article for parsing with pandas."""
df = pd.read_html(self.raw_html_resolution, flavor="bs4")
# Contains a list of all tables converted to dataframes in the resolution section
list_of_release_df = []
for table_id in range(len(df)):
if table_id == 0:
# The HTML table have no header, we need to reassign the first row as heading
df_header = df[table_id][:1]
current_df = df[table_id][1:]
current_df.columns = df_header.values.tolist()[0]
# Moving the del up here
del df_header
current_df = self.transform_kb2143850(current_df)
# Get the data types right, especially the date format='%m/%d/%Y'
current_df["Release Date"] = pd.to_datetime(current_df["Release Date"], infer_datetime_format=True,
errors='coerce')
list_of_release_df.append(current_df)
else:
print("Unknown table added, please add handling")
return list_of_release_df
def transform_kb2143850(self, dataframe):
"""Special handling of KB2143850 (vRA)"""
if r"Build Number - Version" in dataframe:
dataframe[r"Build Number - Version"] = dataframe[r"Build Number - Version"].str.normalize("NFKD")
dataframe[["Build Number", "Version"]] = dataframe[r"Build Number - Version"].str.split(r" - ", expand=True)
return dataframe
# vxrail releases
class Kb52075(KbData):
def __init__(self, kb_id):
super().__init__(kb_id)
self.list_of_dframes = self.parse_releasedata()
def get_first_product_name(self):
"""Overriding function with hardcoded value as it isn't in the meta section"""
return "Dell VxRAIL"
def parse_releasedata(self):
"""Accepts the html data for product releases from the KB article for parsing with pandas."""
df = pd.read_html(self.raw_html_resolution, flavor="bs4")
# Contains a list of all tables converted to dataframes in the resolution section
list_of_release_df = []
for table_id in range(len(df)):
if table_id == 0:
# The HTML table have no header, we need to reassign the first row as heading
df_header = df[table_id][:1]
# The header has more than one column header, let's merge. I am not even attempting to automate this.
df_header.at[0, 6] = "External vCSA - Min"
df_header.at[0, 7] = "External vCSA - Recommended"
df_header.at[0, 8] = "External vCSA - Max"
current_df = df[table_id][2:]
current_df.columns = df_header.values.tolist()[0]
# Moving the del up here
del df_header
# Normalize unicode with none breaking space in some rows
current_df["Esxi (Version - Build #)"] = current_df["Esxi (Version - Build #)"].str.normalize("NFKD")
# Atomize multi-value columns
current_df[["ESXi version", "ESXi build number"]] = current_df["Esxi (Version - Build #)"].str.split(
pat=r"-", expand=True)
current_df["ESXi version"] = current_df["ESXi version"].str.strip()
current_df["ESXi build number"] = current_df["ESXi build number"].str.strip()
current_df[["VxRail manager version", "VxRail manager build number"]] = current_df[
"VxRail manager"].str.split(
pat=r"-", expand=True)
current_df["VxRail manager build number"] = current_df["VxRail manager build number"].str.strip()
current_df["VxRail manager version"] = current_df["VxRail manager version"].str.strip()
current_df.reset_index(drop=True, inplace=True)
list_of_release_df.append(current_df)
else:
print("Unknown table added, please add handling")
return list_of_release_df