Skip to content

Commit

Permalink
Multiple sitemaps with articles
Browse files Browse the repository at this point in the history
  • Loading branch information
RK206 committed Oct 31, 2024
1 parent 971f61d commit b076f3c
Showing 1 changed file with 146 additions and 72 deletions.
218 changes: 146 additions & 72 deletions portality/bll/services/site.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
from datetime import datetime
from xml.sax.saxutils import escape

from lxml import etree
Expand All @@ -13,114 +14,187 @@
from portality.util import get_full_url_safe

# Sitemap namespace in "{uri}" form; it is prepended to tag names before they
# are handed to etree.SubElement by the element helpers below.
NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"


def create_simple_sub_element(parent, element_name, text=None):
    """Append a namespaced child element to *parent* and return it.

    The child's tag is ``NS + element_name``. Its text is assigned only when
    *text* is not ``None``; otherwise the element is left without text.
    """
    child = etree.SubElement(parent, NS + element_name)
    if text is None:
        return child
    child.text = text
    return child


def create_url_element(parent, loc, change_freq, lastmod=None):
    """Append a ``url`` element to *parent* and return it.

    The new element receives ``loc``, then ``lastmod`` (only when *lastmod*
    is not ``None``), then ``changefreq`` children, in that order.
    """
    url_ele = etree.SubElement(parent, NS + "url")

    children = [("loc", loc)]
    if lastmod is not None:
        children.append(("lastmod", lastmod))
    children.append(("changefreq", change_freq))

    for tag, value in children:
        create_simple_sub_element(url_ele, tag, value)

    return url_ele

# Query body matching only records whose ``admin.in_doaj`` term is true; it is
# passed as ``q`` when iterating over articles for the sitemap.
IN_DOAJ = {"query": {"bool": {"must": [{"term": {"admin.in_doaj": True}}]}}}

# Sitemap XML namespace URI, written into the <urlset> root of each file.
NMSP = "http://www.sitemaps.org/schemas/sitemap/0.9"

# Roll-over thresholds for a single sitemap file: 49 MiB and 49,000 URLs,
# which sit just under the sitemap protocol's per-file caps
# (50 MB uncompressed / 50,000 URLs).
MAX_FILE_SIZE = 49 * 1024 ** 2
MAX_URL_COUNT = 49_000

class SitemapGenerator:
    """Stream sitemap ``<url>`` entries into a rolling series of XML files.

    A first file is opened on construction. Every ``add_url`` call appends
    one entry to the current file; when the current file has reached
    ``MAX_FILE_SIZE`` bytes or ``MAX_URL_COUNT`` entries it is closed, copied
    to ``main_store`` and a new file is started before the next entry is
    written. The caller must call ``finalize_sitemap_file`` once at the end
    to close and publish the last file.
    """

    # Bytes written to the current file so far. Declared at class level so
    # that an instance always has a value, even before the first file exists.
    _bytes_written = 0

    def __init__(self, filename_prefix, temp_store, main_store, container_id):
        """
        :param filename_prefix: prefix of every generated sitemap filename
        :param temp_store: local directory in which files are written
        :param main_store: store object providing ``store()`` and ``url()``
        :param container_id: container passed to ``main_store`` calls
        """
        self.file_idx = 0
        self.url_count = 0
        self.current_file_path = None
        self.current_filename = None
        self.file = None
        self.sitemap_files = []
        self.filename_prefix = filename_prefix
        self.temp_store = temp_store
        self.main_store = main_store
        self.container_id = container_id
        self.change_freq = app.config.get("TOC_CHANGEFREQ", "monthly")
        self.create_sitemap_file()

    def _write(self, text):
        """Write *text* to the current file and account for its UTF-8 size."""
        self.file.write(text)
        self._bytes_written += len(text.encode("utf-8"))

    def add_url(self, url, lastmod=None):
        """Append one URL entry, rolling over to a new file first if needed.

        The limit check happens *before* the write so that a file never
        exceeds ``MAX_URL_COUNT`` entries, ``url_count`` always equals the
        number of entries in the current file, and a roll-over never leaves
        an empty trailing sitemap behind.
        """
        self.check_and_finalize_file()
        self.write_url_element(url, lastmod=lastmod)
        self.url_count += 1

    def write_url_element(self, loc, lastmod=None):
        """Write a single ``<url>`` element to the current file.

        Values are XML-escaped, so characters such as ``&`` in a URL cannot
        produce malformed XML. Children are emitted as loc, lastmod (when
        given), changefreq - the same order ``create_url_element`` uses.
        """
        parts = [
            "\n  <url>",
            f"\n    <loc>{escape(str(loc))}</loc>",
        ]
        if lastmod is not None:
            parts.append(f"\n    <lastmod>{escape(str(lastmod))}</lastmod>")
        parts.append(f"\n    <changefreq>{escape(str(self.change_freq))}</changefreq>")
        parts.append("\n  </url>")
        self._write("".join(parts))

    def create_sitemap_file(self):
        """Open the next numbered sitemap file and write its XML preamble."""
        self.current_filename = f'{self.filename_prefix}_{self.file_idx}_utf8.xml'
        self.current_file_path = os.path.join(self.temp_store, self.current_filename)
        # The XML declaration promises UTF-8, so do not rely on the locale's
        # default encoding; fixed newlines keep the byte accounting exact.
        self.file = open(self.current_file_path, "w", encoding="utf-8", newline="\n")
        self._bytes_written = 0
        self._write('<?xml version="1.0" encoding="UTF-8"?>\n')
        self._write('<urlset xmlns="' + NMSP + '">')
        self.file_idx += 1

    def finalize_sitemap_file(self):
        """Close the current file, copy it to the main store and record its URL."""
        try:
            self.file.write('\n</urlset>\n')
        finally:
            # Release the handle even if the final write fails.
            self.file.close()
        self.main_store.store(self.container_id, self.current_filename, source_path=self.current_file_path)
        self.sitemap_files.append(self.main_store.url(self.container_id, self.current_filename))

    def check_and_finalize_file(self):
        """Roll over to a new file when the current one has hit a limit.

        Size is taken from the running byte count rather than from the file
        system: the file is still open and buffered, so its on-disk size can
        lag behind what has been written.
        """
        if self._bytes_written >= MAX_FILE_SIZE or self.url_count >= MAX_URL_COUNT:
            self.finalize_sitemap_file()
            self.create_sitemap_file()
            self.url_count = 0

    def get_url_count(self):
        """Return the number of URL entries written to the current file."""
        return self.url_count

    def get_sitemap_files(self):
        """Return the main-store URLs of all finalized sitemap files."""
        return self.sitemap_files

# Service class exposing sitemap generation.
# NOTE(review): the body of sitemap() below appears to interleave two
# implementations - a single-file flow (filename / out / urlset / counter /
# url, built with create_url_element and lxml) and a multi-file flow built on
# SitemapGenerator plus a sitemap index. It contains two `if prune:` blocks,
# two `sort` / `_filter` definitions and two `return` statements. Confirm
# which statements are live before relying on any of them.
class SiteService(object):

def sitemap(self, prune: bool = True):
"""
Generate the sitemap
~~Sitemap:Feature~~
:return:
"""
# first validate the incoming arguments to ensure that we've got the right thing
# NOTE(review): the first argument is "csv" although this is the sitemap
# method - confirm whether that label is intentional.
argvalidate("csv", [
{"arg": prune, "allow_none": False, "arg_name": "prune"}
], exceptions.ArgumentException)

# human-readable messages describing what was done; returned to the caller
action_register = []

# normalise the configured base URL so paths can be appended directly
base_url = app.config.get("BASE_URL")
if not base_url.endswith("/"):
base_url += "/"

# ~~-> FileStoreTemp:Feature~~
# single-file flow: one timestamped sitemap file in the temp store
filename = 'sitemap__doaj_' + dates.now_str(FMT_DATETIME_SHORT) + '_utf8.xml'
container_id = app.config.get("STORE_CACHE_CONTAINER")
tmpStore = StoreFactory.tmp()
out = tmpStore.path(container_id, filename, create_container=True, must_exist=False)
# multi-file flow: one timestamp shared by every file produced in this run
run_start_time = dates.now_str(FMT_DATETIME_SHORT)

toc_changefreq = app.config.get("TOC_CHANGEFREQ", "monthly")
filename_prefix = 'sitemap_doaj_' + run_start_time
cache_container_id = app.config.get("STORE_CACHE_CONTAINER")
# per-run container: the cache container joined with the run's filename prefix
# NOTE(review): os.path.join uses the platform separator - confirm the store
# accepts the resulting container id.
container_id = os.path.join(cache_container_id,filename_prefix)

# single-file flow: in-memory lxml tree rooted at <urlset>
NSMAP = {None: "http://www.sitemaps.org/schemas/sitemap/0.9"}
urlset = etree.Element(NS + "urlset", nsmap=NSMAP)
# multi-file flow: per-category counters reported in action_register at the end
total_static_pages = 0
total_journals_count = 0
total_articles_count = 0

counter = 0
# ~~->FileStore:Feature~~
tmpStore = StoreFactory.tmp()
mainStore = StoreFactory.get("cache")

# do the static pages
# temporary directory
tmp_store_dir = tmpStore.path(container_id, '', create_container=True)
# Create the directories if they don't exist
os.makedirs(tmp_store_dir, exist_ok=True)

# opens the first sitemap file inside tmp_store_dir on construction
sitemap_generator = SitemapGenerator(filename_prefix, tmp_store_dir, mainStore, container_id)

# Generating URLs for static pages
# collect the navigation routes as full URLs, dropping empty results and
# duplicates, in sorted order
_entries = nav.get_nav_entries()
_routes = nav.yield_all_route(_entries)
_urls = (get_full_url_safe(r) for r in _routes)
_urls = filter(None, _urls)
_urls = set(_urls)
_urls = sorted(_urls)

#static pages
for u in _urls:
create_url_element(urlset, u, toc_changefreq)
counter += 1
sitemap_generator.add_url(u)
total_static_pages += 1

# do all the journal ToCs and articles
# Generating URLs for journals and articles
for j in models.Journal.all_in_doaj():
# first create an entry purely for the journal
toc_loc = base_url + "toc/" + j.toc_id
sitemap_generator.add_url(toc_loc, lastmod=j.last_updated)
# second entry per journal: its articles listing page (written without lastmod)
toc_art_loc = base_url + "toc/" + j.toc_id + "/articles"
create_url_element(urlset, toc_loc, toc_changefreq, lastmod=j.last_updated)
create_url_element(urlset, toc_art_loc, toc_changefreq)
counter += 2

# log to the screen
action_register.append("{x} urls written to sitemap".format(x=counter))
sitemap_generator.add_url(toc_art_loc)
total_journals_count += 1

# Generating URLs for articles
for a in models.Article.iterate(q=IN_DOAJ, keepalive='5m'):
article_loc = base_url + "article/" + a.id
sitemap_generator.add_url(article_loc, lastmod=a.last_updated)
total_articles_count += 1

# close and publish the last, partially filled sitemap file
if sitemap_generator.get_url_count() > 0:
sitemap_generator.finalize_sitemap_file()

# Create sitemap index file
sitemap_index_filename = f'sitemap_index_doaj_{run_start_time}_utf8.xml'
sitemap_index_path = os.path.join(tmp_store_dir, sitemap_index_filename)
# NOTE(review): the file is opened without an explicit encoding although the
# XML declaration written below states UTF-8.
with open(sitemap_index_path, "w") as f:
f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
f.write('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
for sitemap_url in sitemap_generator.get_sitemap_files():
f.write(f" <sitemap>\n")
f.write(f" <loc>{sitemap_url}</loc>\n")
# NOTE(review): run_start_time is formatted with FMT_DATETIME_SHORT - confirm
# that format is a valid <lastmod> value; sitemap_url is written unescaped.
f.write(f" <lastmod>{run_start_time}</lastmod>\n")
f.write(f" </sitemap>\n")
f.write('</sitemapindex>\n')

# publish the index to the main store; its URL is what gets cached and returned
mainStore.store(container_id, sitemap_index_filename, source_path=sitemap_index_path)
index_url = mainStore.url(container_id, sitemap_index_filename)

action_register.append("Sitemap index written to store with url {x}".format(x=index_url))

# Prune old sitemaps if required
if prune:
def sort(filelist):
# NOTE(review): the pattern contains \d but is not a raw string.
rx = "sitemap_doaj_(\d{8})_(\d{4})"

# save it into the temp store
tree = etree.ElementTree(urlset)
with open(out, "wb") as f:
tree.write(f, encoding="UTF-8", xml_declaration=True, pretty_print=True)
# pair each matching name with the datetime parsed from its two digit groups;
# names that do not match the pattern are dropped
matched_dates = [
(filename, datetime.strptime(match.groups()[0]+"_"+match.groups()[1], FMT_DATETIME_SHORT))
for filename in filelist
if (match := re.match(rx, filename))
]
# newest first
return [x for x, _ in sorted(matched_dates, key=lambda x: x[1], reverse=True)]

# ~~->FileStore:Feature~~
mainStore = StoreFactory.get("cache")
try:
mainStore.store(container_id, filename, source_path=out)
url = mainStore.url(container_id, filename)
finally:
tmpStore.delete_file(container_id,
filename) # don't delete the container, just in case someone else is writing to it

action_register.append("Sitemap written to store with url {x}".format(x=url))
# only names beginning with "sitemap_" are candidates for pruning
def _filter(filename):
return filename.startswith("sitemap_")

# remove all but the two latest sitemaps
if prune:
def sort(filelist):
rx = "sitemap__doaj_(.+?)_utf8.xml"
return sorted(filelist,
key=lambda x: datetime.strptime(re.match(rx, x).groups(1)[0], FMT_DATETIME_SHORT),
reverse=True)
# prune both the main store and the temp store, keeping the two newest entries
action_register += prune_container(mainStore, cache_container_id, sort, filter=_filter, keep=2)
action_register += prune_container(tmpStore, cache_container_id, sort, filter=_filter, keep=2)

def _filter(filename):
return filename.startswith("sitemap__")
# Update the cache record to point to the new sitemap index and all sitemaps
models.Cache.cache_sitemap(index_url)

action_register += prune_container(mainStore, container_id, sort, filter=_filter, keep=2)
action_register.append(f"Static pages count : {total_static_pages}")
action_register.append(f"Journal URLs count : {total_journals_count}")
action_register.append(f"Article URLs count : {total_articles_count}")

# update the ES record to point to the new file
# ~~->Cache:Feature~~
models.Cache.cache_sitemap(url)
return url, action_register
# multi-file flow result: the index URL plus the list of action messages
return index_url, action_register

0 comments on commit b076f3c

Please sign in to comment.