Skip to content

Commit

Permalink
Merge pull request #6 from internetarchive/webcomponent
Browse files Browse the repository at this point in the history
Overhaul of the WebComponent
  • Loading branch information
ibnesayeed authored Jun 17, 2022
2 parents f5e3aee + 97407e7 commit cb5184e
Show file tree
Hide file tree
Showing 7 changed files with 1,545 additions and 34 deletions.
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,12 @@ optional arguments:

## Sample Output

```
$ cdxsummary sample.cdx.gz
### Plain Text Summary

<details>
<summary>$ cdxsummary sample.cdx.gz</summary>

```
CDX Overview
────────────────────────────────────
Total Captures in CDX 74,460
Expand Down Expand Up @@ -154,6 +157,12 @@ $ cdxsummary sample.cdx.gz
* https://web.archive.org/web/20210318000510/https://roundme.com/embed/ro6VYzBNE5vePdZ3xyph
* https://web.archive.org/web/20210318000510/https://prevention.cancer.gov/news-and-events/videos-and-webinars
```
</details>

### JSON Summary

<details>
<summary>$ cdxsummary --json sample.cdx.gz</summary>

```
$ cdxsummary --json sample.cdx.gz
Expand Down Expand Up @@ -379,3 +388,8 @@ $ cdxsummary --json sample.cdx.gz
]
}
```
</details>

## Testing

An [interactive test interface](https://internetarchive.github.io/cdx-summary/webcomponent/) is available for the Web Component that renders the JSON summary.
25 changes: 25 additions & 0 deletions webcomponent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,28 @@ Alternatively, render a summary from a `web` collection/item of Internet Archive
```html
<cdx-summary item="PETABOX_ITEM_OR_COLLECTION_ID"></cdx-summary>
```

One of the `src` and `item` attributes is mandatory for the element to render.

By default, sample capture playback links (i.e., memento URIs or URI-Ms) point to `https://web.archive.org/web/`, but this can be customized by specifying the `playback` attribute.
To control the maximum number of thumbnails of random sample captures (rendered by embedding them in iframes), specify a positive integer in the `thumbs` attribute.
The list of random sample capture playback URIs is hidden by default, but can be expanded by setting the `drawer` attribute to `open`.
The element accepts a `name` attribute, which defaults to the name of the summary file (without extensions).
A `type` attribute can be used to customize textual descriptions with a value of `collection`, `item`, or `CDX` (defaults to `CDX`).
A `report` attribute points to a comprehensive version of the summary file, which is derived from the `item` attribute for Petabox items/collections, unless specified explicitly.
Numbers reported in various table cells can be formatted using the `format` attribute with values `short`, `percent`, or `local` (defaults to `local`).
The code below illustrates the usage of these attributes.

```html
<cdx-summary src="https://example.org/files/covid-collection-cdx.summary.json"
report="https://example.org/files/covid-collection-cdx.report.json.gz"
type="collection"
name="COVID-19 Collection"
format="short"
thumbs="10"
playback="https://archive.example.com/memento/"
drawer="open">
</cdx-summary>
```

An [interactive test interface](https://internetarchive.github.io/cdx-summary/webcomponent/) is available for the Web Component that renders the JSON summary.
171 changes: 140 additions & 31 deletions webcomponent/cdxsummary.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,23 @@ export class CDXSummary extends HTMLElement {
}

toFs(s) {
let i = Math.floor(Math.log(s)/Math.log(1024));
return (s/Math.pow(1024, i)).toFixed(2) * 1 + ' ' + ['B', 'kB', 'MB', 'GB', 'TB'][i];
let i = Math.log(s)/Math.log(1024) | 0;
return `${Number((s/Math.pow(1024, i)).toFixed(2))} ${['B', 'KB', 'MB', 'GB', 'TB'][i]}`;
}

toSn(n) {
let i = Math.log10(n)/3 | 0;
return `${Number((n/Math.pow(1000, i)).toFixed(1))}${['', 'K', 'M', 'B', 'T'][i]}`;
}

toDate(dt) {
return `${dt.substring(0, 4)}-${dt.substring(4, 6)}-${dt.substring(6, 8)}`;
}

toMonth(m) {
return (new Date(1970, m-1, 15)).toLocaleDateString('default', {month: 'short'});
}

toNum(n) {
return new Intl.NumberFormat().format(n);
}
Expand All @@ -26,14 +35,18 @@ export class CDXSummary extends HTMLElement {
return `${(n*100/this.data.captures).toFixed(2)}%`;
}

numCell(n) {
return `<td title="${this.toPerc(n)}">${this.toNum(n)}</td>`;
numCell(n, h='') {
return `<td title="${[h, this.toPerc(n), this.toSn(n), this.toNum(n)].filter(i => i).join('\n')}">${this.formatter(n)}</td>`;
}

numSum(nums) {
return nums.reduce((s, v) => s + v, 0);
}

urim(dt, urir, mod='') {
return `${this.playback}/${dt}${mod}/${urir}`;
}

overviewTable() {
return `
<table>
Expand Down Expand Up @@ -67,8 +80,8 @@ export class CDXSummary extends HTMLElement {
`;
}

gridTable(obj, group='', cols) {
if (!cols) {
gridTable(obj, group='', cols=[], format=c=>c) {
if (!cols.length) {
cols = Object.keys(Object.values(obj)[0]);
}
const colSum = cols.reduce((a, k) => {a[k] = 0; return a}, {});
Expand All @@ -77,7 +90,7 @@ export class CDXSummary extends HTMLElement {
<thead>
<tr>
<th scope="col">${group}</th>
${cols.map(c => `<th scope="col">${c}</th>`).join('\n')}
${cols.map(c => `<th scope="col">${format(c)}</th>`).join('\n')}
<th scope="col">TOTAL</th>
</tr>
</thead>
Expand All @@ -88,18 +101,18 @@ export class CDXSummary extends HTMLElement {
<th scope="row">${o[0]}</th>
${cols.map(c => {
colSum[c] += o[1][c];
return this.numCell(o[1][c]);
return this.numCell(o[1][c], `${o[0]} - ${format(c)}`);
}).join('\n')}
${this.numCell(this.numSum(Object.values(o[1])))}
${this.numCell(this.numSum(Object.values(o[1])), o[0])}
</tr>`
}).join('\n')}
</tbody>
${Object.keys(obj).length > 1 && `
<tfoot>
<tr>
<th scope="col">TOTAL</th>
${cols.map(c => this.numCell(colSum[c])).join('\n')}
${this.numCell(this.data.captures)}
${cols.map(c => this.numCell(colSum[c], format(c))).join('\n')}
${this.numCell(this.data.captures, 'ALL')}
</tr>
</tfoot>`}
</table>
Expand All @@ -122,15 +135,15 @@ export class CDXSummary extends HTMLElement {
return `
<tr>
<th scope="row">${h[0]}</th>
${this.numCell(h[1])}
${this.numCell(h[1], h[0])}
</tr>`
}).join('\n')}
</tbody>
${otherHostsCount > 0 && `
<tfoot>
<tr>
<th scope="row">OTHERS (${this.toNum(otherHostsCount)} Hosts)</th>
${this.numCell(otherHostsTotal)}
${this.numCell(otherHostsTotal, 'OTHERS')}
</tr>
</tfoot>`}
</table>
Expand All @@ -139,9 +152,32 @@ export class CDXSummary extends HTMLElement {

sampleCapturesList() {
return `
<details ${this.drawer} class="samples">
<summary data-open="Hide Sample URIs" data-close="Show ${this.data.samples.length} Random Sample URIs"></summary>
<ul>
${this.data.samples.map(s => s.concat(s[1].replace(/^(https?:\/\/)?(www\.)?/i, ''))).sort((a, b) => a[2].length - b[2].length).map(s => `<li><a href="${this.WAYBACK}${s[0]}/${s[1]}">${s[2]}</a></li>`).join('\n')}
${this.data.samples.map(s => s.concat(s[1].replace(/^(https?:\/\/)?(www\.)?/i, ''))).sort((a, b) => a[2].length - b[2].length).map(s => `<li><a href="${this.urim(s[0], s[1])}">${s[2]}</a></li>`).join('\n')}
</ul>
</details>
`;
}

sampleThumbs() {
let s = this.data.samples;
const ridx = new Set();
while (ridx.size < Math.min(this.thumbs, s.length)) {
ridx.add(Math.floor(Math.random()*s.length));
}
return `
<div class="sample-thumbs">
${[...ridx].map(i => `
<div class="thumb-container">
<div class="thumb">
<a href="${this.urim(s[i][0], s[i][1])}">${s[i][1]}</a>
<iframe src="${this.urim(s[i][0], s[i][1], 'if_')}" sandbox="allow-same-origin allow-scripts" scrolling="no" frameborder="0" onload="this.style.backgroundColor='white'"></iframe>
</div>
</div>
`).join('\n')}
</div>
`;
}

Expand All @@ -154,55 +190,57 @@ ${this.data.samples.map(s => s.concat(s[1].replace(/^(https?:\/\/)?(www\.)?/i, '

container.innerHTML = `
<p>
The summary below is based on the <a href="${this.src}">Item/Collection Summary JSON</a> file.
A more comprehensive <a href="${this.src.replace(/\.summary\.json$/, '.report.json.gz')}">Item/Collection Report</a> file is available for detailed analysis and research.
The summary of the <i>${this.name}</i> ${this.type} below is based on the <a href="${this.src}">summary JSON file</a>.
${this.report ? `A more comprehensive <a href="${this.report}">report JSON file</a> is available for detailed analysis and research.` : ''}
</p>
<p class="info">
Hover over on numeric cells to see the percentage value in the tooltip w.r.t. the total number of captures.
Hover over on numeric cells to see the values in various formats in the tooltip (percentage values are w.r.t. the total number of captures).
Insignificant values might be reported as <code>0.00%</code>.
</p>
<h2>CDX Overview</h2>
<h2>Overview</h2>
<p>
This overview is based on the sorted unique capture index (CDX) file of all the WARC files in the item/collection.
This overview is based on the sorted unique capture index (CDX) file of all the WARC files in the ${this.type}.
The <code>Total WARC Records Size</code> value is neither the combined size of the WARC files nor the sum of the sizes of the archived resources, instead, it is the sum of the sizes of the compressed WARC Response records (including their headers).
</p>
${this.overviewTable()}
<h2>MIME Type and Status Code Distribution</h2>
<h2>MIME Type and Status Code</h2>
<p>
The grid below shows HTTP status code groups of captures of various media types in this item/collection.
The <code>Revisit</code> records do not represents an independent media type, instead, they reflect an unchanged state of representations of resources from some of their prior observations (i.e., the same content digest for the same URI).
The matrix below shows HTTP status code groups of captures of various media types in this ${this.type}.
The <code>Revisit</code> records do not represent an independent media type, instead, they reflect an unchanged state of representations of resources from some of their prior observations (i.e., the same content digest for the same URI).
The <code>TOTAL</code> column shows combined counts for each media type irrespective of their HTTP status code and the <code>TOTAL</code> row (displayed only if there are more than one media types listed) shows the combined counts of each HTTP status code group irrespective of their media types.
</p>
${this.gridTable(this.data.mimestatus, 'MIME')}
<h2>Path and Query Segments</h2>
<h2>Path Segment and Query Parameter</h2>
<p>
The grid below shows the number of path segments and the number of query parameters of various URIs in this item/collection.
The matrix below shows the number of path segments and the number of query parameters of various URIs in this ${this.type}.
For example, the cell <code>P0</code> and <code>Q0</code> shows the number of captures of homepages of various hosts with zero path segments and zero query parameters.
The URI <code>https://example.com/img/logo.png?width=300&height=100&screen=highres</code> has two path segments (i.e., <code>/img/logo.png</code>) and three query parameters (i.e., <code>width=300&height=100&screen=highres</code>), hence counted under the <code>P2</code> and <code>Q3</code> cell.
The <code>TOTAL</code> column shows combined counts for URIs with a specific number of path segments irrespective of their number of query parameters and the <code>TOTAL</code> row (displayed only if there are URIs with a varying number of path segments) shows the combined counts for URIs with a specific number of query parameters irrespective of their number of path segments.
</p>
${this.gridTable(this.data.pathquery, 'Path')}
<h2>Year and Month Distribution</h2>
<h2>Year and Month</h2>
<p>
The grid below shows the number of captures of this item/collection observed in different calendar years and months.
The matrix below shows the number of captures of this ${this.type} observed in different calendar years and months.
The <code>TOTAL</code> column shows combined counts for corresponding years and the <code>TOTAL</code> row (displayed only if the captures were observed across multiple calendar years) shows the combined number of captures observed in the corresponding calendar months irrespective of their years.
</p>
${this.gridTable(this.data.yearmonth, 'Year', Object.keys(Object.values(this.data.yearmonth)[0]).sort())}
${this.gridTable(this.data.yearmonth, 'Year', Object.keys(Object.values(this.data.yearmonth)[0]).sort(), this.toMonth)}
<h2>Top <i>${this.toNum(Object.keys(this.data.tophosts).length)}</i> Out of <i>${this.toNum(this.data.hosts)}</i> Hosts</h2>
<p>
The table below shows the top hosts of this item/collection based on the number of captures of URIs from each host.
The table below shows the top hosts of this ${this.type} based on the number of captures of URIs from each host.
The <code>OTHERS</code> row, if present, is the sum of the longtail of hosts.
</p>
${this.topHostsTable()}
<h2><i>${this.data.samples.length}</i> Random Samples of <i>OK HTML</i> Captures</h2>
<h2>Random HTML Capture Samples</h2>
${this.sampleThumbs()}
<p>
Below is a list of random sample of captured URIs linked to their corresponding Wayback Machine playback URIs from this item/collection.
Below is a list of random sample of captured URIs linked to their corresponding Wayback Machine playback URIs from this ${this.type}.
The sample is chosen only from captures that were observed with the <code>text/html</code> media type and <code>200 OK</code> HTTP status code.
Any unexpected URIs listed below (e.g., with a <code>.png/.jpg/.pdf</code> file extension) are likely a result of the Soft-404 issue from the origin server.
</p>
Expand All @@ -220,11 +258,25 @@ ${this.sampleCapturesList()}
}

async connectedCallback() {
this.playback = (this.getAttribute('playback') || this.WAYBACK).replace(/\/+$/, '');
this.thumbs = ((parseInt(this.getAttribute('thumbs'))+1) || 5)-1;
this.format = this.getAttribute('format') || 'local';
this.formatter = (this.format == 'short') ? this.toSn : (this.format == 'percent') ? this.toPerc : this.toNum
this.drawer = this.getAttribute('drawer') || '';
this.type = this.getAttribute('type') || 'CDX';
this.name = this.getAttribute('name') || '';
this.report = this.getAttribute('report') || '';
this.src = this.getAttribute('src') || '';
this.item = this.getAttribute('item') || '';
if(this.item && !this.src) {
this.src = `${this.PETABOX}${this.item}/${this.item}.summary.json`;
}
if(this.item && !this.report) {
this.report = `${this.PETABOX}${this.item}/${this.item}.report.json.gz`;
}
if(!this.name) {
this.name = this.src.split('/').pop().replace(/(.summary)?.json$/, '');
}
this.data['msg'] = this.src ? 'Loading summary...' : 'Either "src" or "item" attribute is required for the &lt;cdx-summary&gt; element!';

this.shadow.innerHTML = `
Expand All @@ -233,8 +285,13 @@ ${this.sampleCapturesList()}
padding: 5px;
font-family: "Helvetica Neue",Helvetica,Arial,sans-serif;
}
table {
table {
border-collapse: collapse;
display: block;
max-width: fit-content;
margin: 0 auto;
overflow-x: auto;
white-space: nowrap;
}
tr:nth-child(even), li:nth-child(even) {
background-color: #eee;
Expand Down Expand Up @@ -273,6 +330,58 @@ ${this.sampleCapturesList()}
.info::before {
content: "🛈 ";
}
summary {
cursor: pointer;
}
details {
margin-bottom: 20px;
}
details.samples[open] summary::after {
content: attr(data-open);
}
details.samples:not([open]) summary::after {
content: attr(data-close);
}
.sample-thumbs {
text-align: center;
}
.thumb-container {
width: 294px;
height: 186px;
display: inline-block;
overflow: hidden;
position: relative;
}
.thumb {
width: 288px;
height: 180px;
border: 1px solid #333;
border-radius: 4px;
padding: 2px;
background-color: #fff;
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 58 58' style='enable-background:new 0 0 58 58' xml:space='preserve'%3E%3Cpath fill='lightgray' d='M31 56h24V32H31v24zm2-22h9v12.586l-4.293-4.293-1.414 1.414L43 50.414l6.707-6.707-1.414-1.414L44 46.586V34h9v20H33V34zM21.569 13.569C21.569 10.498 19.071 8 16 8s-5.569 2.498-5.569 5.569c0 3.07 2.498 5.568 5.569 5.568s5.569-2.497 5.569-5.568zm-9.138 0C12.431 11.602 14.032 10 16 10s3.569 1.602 3.569 3.569-1.601 3.569-3.569 3.569-3.569-1.601-3.569-3.569zM6.25 36.661a.997.997 0 0 0 1.41.09l16.313-14.362 7.319 7.318a.999.999 0 1 0 1.414-1.414l-1.825-1.824 9.181-10.054 11.261 10.323a1 1 0 0 0 1.351-1.475l-12-11a1.002 1.002 0 0 0-1.414.063l-9.794 10.727-4.743-4.743a1.003 1.003 0 0 0-1.368-.044L6.339 35.249a1 1 0 0 0-.089 1.412z'/%3E%3Cpath fill='lightgray' d='M57 2H1a1 1 0 0 0-1 1v44a1 1 0 0 0 1 1h24a1 1 0 1 0 0-2H2V4h54v25a1 1 0 1 0 2 0V3a1 1 0 0 0-1-1z'/%3E%3C/svg%3E");
background-position: center;
background-repeat: no-repeat;
background-size: 30%;
}
.thumb a {
display: block;
position: absolute;
z-index: 2;
inset: 0;
color: #fff;
background: #fff;
opacity: 0;
}
.thumb iframe {
overflow: hidden;
position: relative;
z-index: 1;
width: 960px;
height: 600px;
transform-origin: 0 0;
transform: scale(0.3, 0.3);
}
</style>
<div id="container">
</div>
Expand Down
Loading

0 comments on commit cb5184e

Please sign in to comment.