forked from jimbarnesrtp/pf2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
buildDeities.py
236 lines (210 loc) · 10.9 KB
/
buildDeities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
from bs4 import BeautifulSoup
import requests
import json
import datetime
import codecs
import time
deityHolder = {}
deityHolder['name'] = 'Pathfinder 2.0 deities list'
deityHolder['date'] = datetime.date.today().strftime("%B %d, %Y")
def encoder(words):
encoded = words.replace(u"\u2019", "'")
encoded1 = encoded.replace(u"\u2014", "-")
encoded2 = encoded1.replace(u"\u2011", " ")
encoded3 = encoded2.replace(u'\u201c', "'")
encoded4 = encoded3.replace(u'\u201d', "'")
encoded5 = encoded4.replace(u'\u2026', "...")
encoded6 = encoded5.replace(u'\u2013', "-")
encoded7 = encoded6.replace(u'\u2018', "'")
encoded8 = encoded7.replace(u'\ufb01', "fi")
encoded9 = encoded8.replace(u'\ufb02', "fl")
encoded10 = encoded9.replace(u'\u00d7', "x")
return encoded10
def get_details(link):
itemDetails = {}
res = requests.get(link)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'lxml')
feat = soup.find_all("div", {'class':'main'})
detail = soup.find("span", {'id':'ctl00_MainContent_DetailedOutput'})
children = detail.contents
skipTitle = False
if children[0].find("a").get('href') == "PFS.aspx":#Only some entries have PFS tags, and if they do then the HTML of the source is broken. So we break out of it, and then do handling for any issues this causes later.
children = detail.next_sibling.parent.contents
skipTitle = True
reachedBreak = False
reachedCrit = False
reachedIntercession = False
detailHolder = []
tagType = ""
string = " "
divAbility = []
spellHolder = []
divinterDesc = ""
domainHolder = []
altDomains = []
alreadyHandled = ["Follower Alignments", "Favored Weapon", "Divine Font"]
for child in children:
#print (str(child))
stringContents = str(child)
if stringContents.startswith("<"):
if child.name == "a":
try:
if child['class'][0] == "external-link":
if reachedIntercession == False:
itemDetails['source'] = encoder(str(child.text))
else:
itemDetails['divinterSource'] = encoder(str(child.text))
tagType = ""
except:
pass
if child.name == "b":
tagType = encoder(str(child.text))
if ((child.name == "i") or ((child.name == "a") and ((child.get("style") == "text-decoration:underline") or (child.find('u'))))) and (tagType not in alreadyHandled):
if tagType == "Cleric Spells":
spellHolder.append(encoder(child.text))
elif tagType == "Domains":
domainHolder.append(encoder(child.string))
elif tagType == "Alternate Domains":
altDomains.append(encoder(child.string))
else:
if itemDetails.get(tagType):#If this tag already has something, we fuse the text with the existing something instead of setting it
if child.find("i"):
itemDetails[tagType] = (str(itemDetails[tagType]) + "*" + encoder(str(child.string)) + "*")#Markdown seems like a good way to handle *italcs*.
else:#In this case it's probably a link tag, which we just want the plain text of.
itemDetails[tagType] = (str(itemDetails[tagType]) + encoder(str(child.string)))
elif tagType == "":#If we haven't yet hit a tag, then we assume we're in the main text block, and append it to detailHolder.
if child.name == "i":
detailHolder.append("*" + encoder(str(child.string)) + "*")
else:
detailHolder.append(encoder(str(child.string)))
else:
itemDetails[tagType] = encoder(str(child.string))
if (child.name == "h2") and (child.text == "Divine Intercession"):
reachedIntercession = True
else:
if tagType != "":
if (tagType not in alreadyHandled):
if not stringContents.isspace():
if (tagType == "Divine Ability"):
divAbility.append(encoder(stringContents).strip().split(' or '))
elif (tagType == "Cleric Spells"):
if (child.next_sibling.name != None) and (child.next_sibling.name == 'i'): #Only grabs spells until the next thing isn't in italics (we expect spells to be italicized)
spellHolder.append(encoder(stringContents))
elif reachedIntercession == True:
divinterDesc = divinterDesc + encoder(str(child.string))
elif reachedIntercession == False:#If we're still in the main block, there must be some condition on the spell gained, like Apsu's "(metallic dragons only)"
spellHolder[-1] = str(spellHolder[-1] + encoder(str(child.string)))#...So we fuse it to the most recent spell entry.
else:
print ("Unhandled text: " + str(child))
elif tagType in ("Domains", "Alternate Domains"):#We don't want this text, as it's just commas.
pass
else:
if itemDetails.get(tagType):
itemDetails[tagType] = (str(itemDetails[tagType]) + encoder(str(child.string)))
else:
itemDetails[tagType] = encoder(str(child.string))
else:
if not stringContents.isspace():
if (not detailHolder) and (skipTitle == True):#If there's nothing already in detailHolder, then we might need to skip the deity's title.
detailHolder.append(" ")
elif reachedIntercession == False:
detailHolder.append(encoder(stringContents))
#print(child)
#print('<!!!!!!!!!!!!!!!!!!!!!!!!!>')
spellHolder = [spellEntry.strip(",:").strip() for spellEntry in spellHolder]#strips ,: and whitespace from the spellHolder
spellIter = iter(spellHolder)
spellDict = dict(zip(spellIter, spellIter))#Dict-ifies the spells, output formatted like: {'1st': 'sleep', '4th': 'fly"}
for itemEntry in itemDetails:
itemDetails[itemEntry] = itemDetails[itemEntry].strip(",:").strip()
itemDetails['domains'] = domainHolder
itemDetails['altDomains'] = altDomains
itemDetails['clericSpells'] = spellDict
itemDetails['text'] = "".join(detailHolder).strip(', ').strip()
itemDetails['divinterDesc'] = divinterDesc.strip()
itemDetails['divAbility'] = divAbility
return itemDetails
def get_entry(link):
items = []
multiItems = []
res2 = requests.get(link)
res2.raise_for_status()
soup2 = BeautifulSoup(res2.text, 'lxml')
table = soup2.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="ctl00_MainContent_DeityElement")
rows = table.findAll(lambda tag: tag.name=='tr')
t = 0
for row in rows:
t += 1
#print(row)
#print("-----------------------------------")
item = {}
entries = row.find_all(lambda tag: tag.name=='td')
if entries is not None:
if len(entries) > 0:
if (entries[0].find("a").get('href') == "PFS.aspx"):#Not all entries have PFS tags, so we have to handle them seperately
name = entries[0].find("a").find_next("a").text
link = "https://2e.aonprd.com/" + entries[0].find("a").find_next("a").get('href')
pfsLegal = entries[0].find("a").img.get('alt')
else:
name = entries[0].find("a").text
link = "https://2e.aonprd.com/" + entries[0].find("a").get('href')
pfsLegal = 'No PFS Data'
item['name'] = name
item['link'] = link
item['PFS'] = pfsLegal
item['category'] = entries[1].text.strip()
item['alignment'] = entries[2].text.split(",")
item['followerAlignments'] = entries[3].text.replace(u'\u2014', '').split(", ")
item['font'] = entries[4].text.split("/")
#item['domains'] = entries[5].text.split(", ")#Commented out because we handle this in get_details.
item['weapon'] = entries[6].text.split(" or ")
print("getting details for:",item['name'])
itemDetails = get_details(item['link'])
for key in itemDetails.keys():
item[key] = itemDetails[key]
items.append(item)
#if t > 5:
#break
#Handle Faiths seperately from Deities:
table2 = soup2.find(lambda tag: tag.name=='table' and tag.has_attr('id') and tag['id']=="ctl00_MainContent_FaithElement")
rows2 = table2.findAll(lambda tag: tag.name=='tr')
t = 0
for row in rows2:
t += 1
#print(row)
#print("-----------------------------------")
item = {}
entries = row.find_all(lambda tag: tag.name=='td')
if entries is not None:
if len(entries) > 0:
if (entries[0].find("a").get('href') == "PFS.aspx"):#Not all entries have PFS tags, so we have to handle them seperately
name = entries[0].find("a").find_next("a").text
link = "https://2e.aonprd.com/" + entries[0].find("a").find_next("a").get('href')
pfsLegal = entries[0].find("a").img.get('alt')
else:
name = entries[0].find("a").text
link = "https://2e.aonprd.com/" + entries[0].find("a").get('href')
pfsLegal = 'No PFS Data'
item['name'] = name
item['link'] = link
item['PFS'] = pfsLegal
item['category'] = "Faiths and Philosophies"
item['followerAlignments'] = entries[1].text.replace(u'\u2014', '').split(", ")
#item['domains'] = entries[5].text.split(", ")#Commented out because we handle this in get_details.
print("getting details for:",item['name'])
itemDetails = get_details(item['link'])
for key in itemDetails.keys():
item[key] = itemDetails[key]
items.append(item)
return items
def get_all():
deityHolder['deities'] = get_entry("https://2e.aonprd.com/Deities.aspx")
#deityHolder['rangedWeapons'] = get
return deityHolder
#print(get_all())
json_data = json.dumps(get_all(), indent=4)
#print(json_data)
filename = "deities-pf2.json"
f = open(filename, "w")
f.write(json_data)
f.close