-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
67 lines (54 loc) · 2.44 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
import argparse
import os
import glob
import warnings
# Default input and output file names, used when not passed as arguments
defaultInputFileName = "input.html"
defaultOutputFileName = "output.xml"

# Default url hreflang value; links carrying it get no <url> entry of their own
DEFAULT_HREFLANG = "x-default"

# To extract all the html <link> tags and the required attributes.
# Raw string: "\s" in a plain literal is an invalid escape sequence.
regex = re.compile(
    r'\s*<link(?:(?:\s+rel="(?P<rel>.*?)")|(?:\s+hreflang="(?P<hreflang>.*?)")|(?:\s+href="(?P<href>.*?)")|(?:\s+.*?))*?\s*/>',
    re.MULTILINE | re.IGNORECASE | re.DOTALL)

# Matches a trailing ".html" extension in any letter case
extensionRegex = re.compile(r'\.html$', re.IGNORECASE)


def _parseArguments(argv=None):
    """Parse the command-line arguments (input and output file names).

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``input`` and ``output`` attributes, each
        ``None`` when the option was not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="Input file (file containing a list of HTML link tags) or directory (containing multiple .html files to convert)")
    parser.add_argument("-o", "--output", type=str,
                        help="Output file (generated XML Sitemap)")
    return parser.parse_args(argv)


def _resolveFileNames(inputFileName, outputFileName):
    """Pair every input file with the output file it is converted into.

    Args:
        inputFileName: Path of a single input file, or of a directory.
        outputFileName: Output path; only used when the input is a file.

    Returns:
        A list of ``(input path, output path)`` tuples. For a directory, one
        pair per ``*.html`` file directly inside it, each output path being
        the input path with its ``.html`` extension replaced by ``.xml``.
    """
    if not os.path.isdir(inputFileName):
        return [(inputFileName, outputFileName)]

    # If inputFileName is a directory, convert all the html files inside
    warnings.warn("Input is a directory, ignoring output file name.")
    # glob.escape keeps characters such as "[" in the directory name literal
    pattern = os.path.join(glob.escape(inputFileName), "*.html")
    return [(fileName, extensionRegex.sub(".xml", fileName))
            for fileName in glob.glob(pattern)]


def _extractLinks(text):
    """Extract all the html <link> tags and the required attributes.

    Args:
        text: The text to search for self-closing ``<link ... />`` tags.

    Returns:
        One dict per matched tag with the keys ``rel``, ``hreflang`` and
        ``href``; an attribute that did not match maps to ``None``.
    """
    return [match.groupdict() for match in regex.finditer(text)]


def _generateSitemap(links):
    """Generate the XML Sitemap text for the extracted links.

    Every link whose hreflang differs from DEFAULT_HREFLANG becomes a <url>
    entry, and each entry lists all links (including the default one) as
    <xhtml:link> alternates.

    Args:
        links: Dicts with the keys ``rel``, ``hreflang`` and ``href``.

    Returns:
        The complete sitemap document as a string.
    """
    # NOTE: a None attribute value is rendered as the text "None".
    alternates = "\n".join(
        f' <xhtml:link rel="{link["rel"]}" hreflang="{link["hreflang"]}" href="{link["href"]}"/>'
        for link in links)
    urls = "\n".join(
        f' <url>\n<loc>{link["href"]}</loc>\n{alternates}\n</url>'
        for link in links if link["hreflang"] != DEFAULT_HREFLANG)
    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">\n'
        f'{urls}\n'
        '</urlset>\n'
    )


def _convertFile(inputFileName, outputFileName):
    """Read one input file and write the sitemap generated from it.

    Args:
        inputFileName: Path of the file containing the HTML link tags.
        outputFileName: Path the generated XML Sitemap is written to.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Context managers close both files even if reading or writing fails
    with open(inputFileName, "r", encoding="utf-8") as inputFile:
        text = inputFile.read()
    output = _generateSitemap(_extractLinks(text))
    # The XML declaration announces UTF-8, so write exactly that encoding
    with open(outputFileName, "w", encoding="utf-8") as outputFile:
        outputFile.write(output)


def main(argv=None):
    """Convert HTML link tags into XML Sitemaps as requested on the command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    args = _parseArguments(argv)
    # Use default input and output file names if not passed as arguments
    inputFileName = args.input if args.input else defaultInputFileName
    outputFileName = args.output if args.output else defaultOutputFileName
    for sourceName, targetName in _resolveFileNames(inputFileName, outputFileName):
        _convertFile(sourceName, targetName)


if __name__ == "__main__":
    main()