-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadvisory_skipper.py
137 lines (126 loc) · 6.78 KB
/
advisory_skipper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from pathlib import Path
from struct import iter_unpack
from lzma import decompress
from base64 import b64decode
from typing import Iterator
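# Compressed list of story IDs matched from the "foalcon advisory".
# Encoding: base64 text of an LZMA stream of little-endian unsigned 16-bit integers,
# where the first integer is a story ID and each later one is the difference from the previous ID.
# See the __main__ section below for how it is generated.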
skipblob = """
/Td6WFoAAATm1rRGAgAhARYAAAB0L+Wj4Ao7B7FdAHSARbAjJsC5Wpx3PwrQvYGJeeuF8VXhjOgZ
qNwi9M5lRyAv4gUhZkUr5K88tPrUqBOQISHs1vqqm+RRJytCYZ7g+k0zc6fup1rHZXbquCJOfen7
zlaZ9Xxz4+MNdoy28Ch78h5z4Fhqk6ROHfHoxkfaryEerz3LIVmSyMC807UbSz0tpA+oJp1GNCbU
0tYNoO18RQL//7GNRpl38HSLEA42hWN85cWW5EM3II0pxspLDMgDqyZntWuNL+rymfTe2NKKmBP5
kc0YVIp0e0hcax4htFmYdtAFq+YBTK1D8F+ypslnJk1EojveBEDwaJNbaSozXj++LYMtHgsT1F42
3zrW9zULQX+QIrjcWz2GqtubnW1MTPqW1SgKfOdSf3+n4g904xYip1V9CmNmR1Lz2XoUXKACMSl3
UxzafrrMoJlO/lOD/7QWQmbT4m99RlncGzn9SFfMnmCU8Pj5LxgzWveakJJ9wXmwzT3MlnQAnvwC
tRkReeIz/lvDH2ts6ulIAF1tCU7l49VhzGCCQB9glYak7i6A6JwTfkc+Wrjn1tQ8+YNk5JvXI7JK
qEGzhPdeRCF//f7BBDXvlfIxLX/uI4kT/6mt4WquTY7dhj0kb58DqIdVztke1kxKXfHisxsG/9J1
Lq4GpS/F4D25QS/semIRFipO2uGJZIwEltraFsd+prfroPp8F+SXtqSzfNT2P1/vAgoZv0DdKL+A
xTI7Gf9i8KS0/BEn9b0QLRGbnY12ePrpIU02Ya2bqJahQmd7c04FCqlEIQa84voYStPuyOJps+eZ
tzaxTr6Uw2QMNt0im8yg4NzeZ9gWmbM5HYyp7Xc96pxdUqxXBnIQr4UZlN4DNVc3IS+1cY1eVSsP
zyfyNms3wKRZVTztPl2AoRbbygkdpy4z+Q5XxzVLx0R9NhpZcBAE1VhYeV0KgSowOqjmz1m6q2BS
oIhrcdRJj2gbCTEC7aODaHdwNDikjc6G+0oLzxgEJafe8NcKgY2LPg+1I5cFiS+7SmUMgNhNEEcE
8RKcwG0zMZa4zB6wKN2f1S/hkvSfn242tUjWLzsmce66y70Vuh56dStcjZtO+ZcbE/358AV7FBoG
avDVYGhm/ky+MTD3hv4qvuKA6WRaC9A4h7Zl22JBfvScRfbM2qF++uJdTtjxU1jSu8R9M4fwZs5C
hgQ2hwwm2PWkKnPq6DzPZKu+MsLPFwr40ZN2OlO7x0e/GX2jeWjAAQrvRzqfL2T18XQt/Qg+UZ+0
gguoCv2uWaQEn4cmlQVk64fL1LUmtSWesRUNRyIHgxoHIo21S9rcGZGlaZ3P/ZXcNY8VRiIUsnLt
i7CSbC0dVxbif6vTizlA99ruePGu64FVx4Jl4SSlRql0lEcI4GwPV20XOqABSYeeiOKCTxU87M5b
XBQSw4v+cu1zosyiIsx2LgcCNpHszudSi9CaEBMjNklxY02Hc9Xrjf4QmPFAcmEVnh0VlYC0HSmU
2XU7pX+qoH1mRn8qzXCvfDjIbsbCZefZisy7ACjlYE+qo0dCwRNtg5mmocKURYh4ccpXH2pKNtJ4
oixdPm/LX4Z6jhX87PgQxbUQMMy/ZXoH57ui0au6Ox+oDpUGtzQmhw3UkkqaawSp18lnjsYNLUDe
JndnqODJErQTsUAxYkiiub/Mh56rzMrDNp309pWQfYxnV1KNUvljqKV0Rs8Mp2iZGKuT9Bvdv8d/
4h1zg5VqXIDuTgU0cLjvAfbtgfL7XyPmhKbLzvyU+BvzyP/SUDahvGgIC3OLiI97CIxc/RGv78+H
Y8MnLiOimH9a1Hw0tjBGubrzcciG3RW4bWeplzf1axAjsCQWqAtWyScKs+T2GtdkIxCQv+RZII5l
j1JHYqjcPjnozkYVUslZ0pnon1CTVGuMbMOnJNvT9UymsW3jnqk6eXqtiV++SoTZFH71WOX7kmwn
7wV0FpJkh51k/JOsuZofg+Tj4AjpL4AvlnNbhscTU3PNK6Ga/tLufx9YsU8b3fhjlX7DCtXGTMRZ
Pklco57SIBlR9pHBbJXZMi0leG6aUQoWtVBk0D3Dh35UcblbScykteyoDi+CYb/QZvkdUtFvVzP/
YSz4um2XYLUwUo0Rl3YibkyGeVUbYxbQZSY2R1ura+ApuI/yTH3KX22hqfnHlioFcTprngkqMh+S
TOA0wvtG7GICi/AD56ABax+ReKQMWqsmpGHUH9RAZTdPPsq2zcRBJovaTvI/OnDNV4+KmuLJQOHp
5s89oiu83gczf2VzcXWdA44PRIbYvkaQ5LNn5x+bCUnsKOyytbnqq4xsf3UpTVa3wxtzCKoMIqgA
inXfXa7GDuhV4Zb99wvMJiiIOaSlHCWoBGjQ50NVW8Gi+9MnQbcur6BkVhkFpMpzxKzMOudSJ+Kc
sSb6/o9mw5Mfk18RVWRhFcOqoInJ6hWZ12oJRXsjm0st6zJeRZVy4C8fG+Mfd3umN1gAneWjzB4B
Xe86HzkaDe5o7tuAIh4rdJ4KZrfrK9unyPxdUeLWlhuY9TzNII656ct8kZMt8IfVVpstT+RletQb
rWqamo7t52lhJbiT3jyRYdWxZgk9FCN6Z3NPInM99PM9HTLmANpssHNFjY9BWHOa/jpDwhY/DKoD
ijAoAgAAAAAA4y0iUMJU3oMAAc0PvBQAAOi0+KCxxGf7AgAAAAAEWVo="""
def generate_skips(blob: str = skipblob) -> Iterator[int]:
    """Yield the story IDs encoded in a compressed skip blob.

    The blob is base64 text wrapping an LZMA-compressed stream of
    little-endian unsigned 16-bit integers. The first integer is a story ID;
    every following integer is the amount to add to the previous ID to get
    the next one, so the IDs come out in non-decreasing order.

    Args:
        blob: ASCII base64 text. Defaults to the module-level ``skipblob``.

    Yields:
        Each decoded story ID. A blob that decodes to zero integers yields
        nothing.

    Raises:
        UnicodeEncodeError: if ``blob`` contains non-ASCII characters.
        lzma.LZMAError: if the decoded bytes are not a valid LZMA stream.
        struct.error: if the decompressed length is not a multiple of 2.

    Because this is a generator, these errors surface on first iteration,
    not when the function is called.
    """
    packed = decompress(b64decode(blob.encode("ascii")))
    # Starting the running total at zero lets the leading absolute ID and the
    # deltas that follow share one code path. It also avoids calling next()
    # on an empty iterator, whose StopIteration would be converted into a
    # RuntimeError inside a generator (PEP 479).
    story_id = 0
    for (delta,) in iter_unpack("<H", packed):
        story_id += delta
        yield story_id
if __name__ == "__main__":
    # Regeneration script: prints a base64 blob that can be pasted into this file as the skip list.
    #
    # In order for this code to work correctly, you need to have imported the fimfarchive without
    # skipping any stories, especially without the Advisory "magic" tag. The script reads the
    # advisory from disk, but it can be saved from, e.g.:
    # https://web.archive.org/web/20240419223159/https://fimfetch.net/foalcon-advisory
    # Then it searches for the author ID - title pairs, with a little allowance for fuzziness.
    # Any found pairs are stored as a list of story IDs, and then transformed into a blob which
    # can be pasted in code.
    import sys
    from tomllib import loads
    from itertools import pairwise
    from struct import pack
    from lzma import compress
    from base64 import encodebytes
    from bs4 import BeautifulSoup
    from esdocs import Story
    from elasticsearch_dsl import connections
    from Levenshtein import ratio  # pip install levenshtein

    # The config sits next to this script with an .ini suffix but is parsed as TOML.
    # Example advisory_skipper.ini:
    #   advisory_html = "foalcon-advisory.html"
    #   [elasticsearch]
    #   hosts = [ "https://some-host:9200" ]
    #   ca_cert = "/your/http_ca.crt"
    #   username = "elasticsearch reader username"
    #   password = "elasticsearch reader password"
    my_config_path = Path(__file__).with_suffix(".ini")
    # TOML documents are UTF-8 by specification, so do not depend on the locale's default encoding.
    my_config = loads(my_config_path.read_text(encoding="utf-8"))
    es_config = my_config["elasticsearch"]
    connections.create_connection(
        hosts=es_config["hosts"],
        ca_certs=es_config["ca_cert"],
        basic_auth=(es_config["username"], es_config["password"]),
    )

    advisory = Path(my_config["advisory_html"])
    # NOTE(review): assumes the saved advisory page is UTF-8 -- confirm for your copy. Decoding it
    # explicitly keeps title matching independent of the platform's default encoding.
    soup = BeautifulSoup(advisory.read_text(encoding="utf-8"), "lxml-xml")
    fimfiction_section = soup.find_all("table", "userlist")[0]

    ids_found = []
    failures = []
    story_search = Story._index.search().source(includes=["title", "id"])
    for row in fimfiction_section.tbody.find_all("tr"):
        columns = row.find_all("td")
        # The first cell is parsed as the author ID; the third cell holds the story titles.
        author_id = int(columns[0].text)
        search_author = story_search.filter("term", author__id=author_id)
        for story in columns[2].find_all("span", "title"):
            story_title = story.text
            res = search_author.query("match", title=story_title).execute()
            if not res.hits.total.value:
                failures.append({"id": author_id, "title": story_title, "hits": []})
                continue
            for hit in res.hits.hits:
                # find about a dozen more fics due to title differences, especially unicode and
                # special characters that break exact matches
                closeness = ratio(story_title, hit._source["title"], score_cutoff=0.8)
                if closeness > 0.8:
                    ids_found.append(hit._source["id"])
                    break
            else:
                # No hit was close enough.
                # manual review - 298 not found out of 1608 fics in the list, looks like only a
                # few are missed. examples:
                # possible failure "An Afternoon of Discovery" -> "An Evening of Discovery"
                # likely success: "When Things Change [Deleted]" -> "When Things Change (Deleted Scenes)"
                # probably the 298 failures actually deleted and not in fimfarchive. good enough
                failures.append({
                    "id": author_id,
                    "title": story_title,
                    "hits": [hit._source["title"] for hit in res.hits.hits],
                })

    # Report on stderr so stdout carries nothing but the blob.
    print(f"matched {len(ids_found)} stories, {len(failures)} unmatched", file=sys.stderr)
    if not ids_found:
        sys.exit("no advisory stories were matched; nothing to encode")

    ids_found.sort()  # thanks to the FiMFarchive for being ordered sequentially by story ID for this trick
    deltas = [later - earlier for earlier, later in pairwise(ids_found)]
    compress_this = [ids_found[0], *deltas]
    # "H" is an unsigned 16-bit field: pack() raises struct.error if the first ID or any gap
    # between consecutive IDs exceeds 65535.
    blob = pack(f"<{len(compress_this)}H", *compress_this)
    compressed_blob = compress(blob)
    b64 = encodebytes(compressed_blob).decode("ascii")
    print(b64)