diff --git a/Dockerfile b/Dockerfile index 8ed0bdd0..f57b4625 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,21 @@ FROM alpine:3.10 RUN apk update RUN apk add git +# The base alpine find command is quite +# limited. We need full featured find. +RUN apk add findutils + +# We also need coreutils to get fuller +# featured versions of shell commands, +# such as sort. +RUN apk add coreutils + +# We also need gawk +RUN apk add gawk + +# Let's use bash +RUN apk add bash bash-doc bash-completion + COPY LICENSE README.md / COPY entrypoint.sh /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] diff --git a/README.md b/README.md index 21c0e64d..518fba3a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Generate Sitemap +# generate-sitemap [![build](https://github.com/cicirello/generate-sitemap/workflows/build/badge.svg)](https://github.com/cicirello/generate-sitemap/actions?query=workflow%3Abuild) [![GitHub](https://img.shields.io/github/license/cicirello/generate-sitemap)](https://github.com/cicirello/generate-sitemap/blob/master/LICENSE) @@ -11,7 +11,14 @@ html as well as pdf files in the sitemap, and has inputs to control the included file types (defaults include both html and pdf files in the sitemap). It skips over html files that contain ``. It otherwise -does not currently attempt to respect a robots.txt file. +does not currently attempt to respect a robots.txt file. The +sitemap entries are sorted in a consistent order. Specifically, +all html pages appear prior to all URLs to pdf files (if pdfs +are included). The html pages are then first sorted by depth +in the directory structure (i.e., pages at the website root +appear first, etc), and then pages at the same depth are sorted +alphabetically. URLs to pdf files are sorted in the same manner +as the html pages. It is designed to be used in combination with other GitHub Actions. For example, it does not commit and push the generated @@ -101,7 +108,7 @@ file in the root of the repository. After completion, it then simply echos the outputs. ```yml -name: Generate API sitemap +name: Generate xml sitemap on: push: @@ -119,7 +126,7 @@ jobs: fetch-depth: 0 - name: Generate the sitemap id: sitemap - uses: cicirello/generate-sitemap@v1.0.0 + uses: cicirello/generate-sitemap@v1.1.0 with: base-url-path: https://THE.URL.TO.YOUR.PAGE/ - name: Output stats @@ -155,7 +162,7 @@ jobs: fetch-depth: 0 - name: Generate the sitemap id: sitemap - uses: cicirello/generate-sitemap@v1.0.0 + uses: cicirello/generate-sitemap@v1.1.0 with: base-url-path: https://THE.URL.TO.YOUR.PAGE/ path-to-root: docs @@ -178,7 +185,7 @@ then the `peter-evans/create-pull-request` monitors for changes, and if the sitemap changed will create a pull request. ```yml -name: Generate API sitemap +name: Generate xml sitemap on: push: @@ -196,7 +203,7 @@ jobs: fetch-depth: 0 - name: Generate the sitemap id: sitemap - uses: cicirello/generate-sitemap@v1.0.0 + uses: cicirello/generate-sitemap@v1.1.0 with: base-url-path: https://THE.URL.TO.YOUR.PAGE/ - name: Create Pull Request diff --git a/entrypoint.sh b/entrypoint.sh index 365ee3b7..9601fbb1 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,4 +1,4 @@ -#!/bin/sh -l +#!/bin/bash -l websiteRoot=$1 baseUrl=$2 @@ -11,12 +11,9 @@ skipCount=0 function formatSitemapEntry { if [ "$sitemapFormat" == "xml" ]; then - lastModDate=${3/ /T} - lastModDate=${lastModDate/ /} - lastModDate="${lastModDate:0:22}:${lastModDate:22:2}" echo "" >> sitemap.xml echo "$2${1%index.html}" >> sitemap.xml - echo "$lastModDate" >> sitemap.xml + echo "$3" >> sitemap.xml echo "" >> sitemap.xml else echo "$2${1/%\/index.html/\/}" >> sitemap.txt @@ -35,20 +32,20 @@ else fi if [ "$includeHTML" == "true" ]; then - for i in $(find . \( -name '*.html' -o -name '*.htm' \) -type f); do - if [ "0" == $(grep -i -c -E "