Skip to content

Commit

Permalink
Add table plugin for turndown
Browse files Browse the repository at this point in the history
  • Loading branch information
ryanb-gds committed Sep 26, 2023
1 parent b118e69 commit 711474b
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/html-to-govspeak.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import TurndownService from 'turndown'
import tables from './tables'

// TurndownService instance that the `tables` plugin below is registered on.
const service = new TurndownService({
bulletListMarker: '-',
// NOTE(review): the comment says 3 spaces but the literal shows a single
// space in this view — possibly whitespace collapsed when the page was
// extracted; confirm against the repository before relying on it.
listIndent: ' ' // 3 spaces
})

// Apply the plugin imported from './tables' to the service.
service.use(tables)

// define all the elements we want stripped from output
const elementsToRemove = [
'title',
Expand Down
91 changes: 91 additions & 0 deletions src/tables.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Borrowed Array method; `cell` applies it via `.call` to a parent's
// `childNodes` collection to find a node's position among its siblings.
const indexOf = Array.prototype.indexOf
// Rule definitions keyed by rule name; filled in below and registered on a
// turndown service by the default-exported `tables` function.
const rules = {}

// Header (`th`) and data (`td`) cells are both rendered through `cell`,
// which adds the pipe delimiters around the converted cell content.
rules.tableCell = {
  filter: ['th', 'td'],
  replacement: (content, node) => cell(content, node)
}

// Converts a `tr` into one markdown table line. For a heading row, a second
// line of border cells (`---`, or an alignment marker derived from each
// cell's `align` attribute) is appended underneath it.
rules.tableRow = {
  filter: 'tr',
  replacement: function (content, node) {
    const defaultBorder = '---'
    const alignMap = { left: ':--', right: '--:', center: ':-:' }
    let borderCells = ''

    if (isHeadingRow(node)) {
      for (let i = 0; i < node.childNodes.length; i++) {
        const child = node.childNodes[i]
        const align = (child.getAttribute('align') || '').toLowerCase()

        // Only honour the alignments listed in alignMap. A plain
        // `alignMap[align]` lookup would also resolve inherited keys such as
        // "constructor" or "toString" to functions, which then crash `cell`.
        const border = Object.prototype.hasOwnProperty.call(alignMap, align)
          ? alignMap[align]
          : defaultBorder

        borderCells += cell(border, child)
      }
    }

    return '\n' + content + (borderCells ? '\n' + borderCells : '')
  }
}

// Converts a `table` element into a markdown table block.
rules.table = {
  // Only convert tables whose first row is a heading row. Any other table
  // (including one with no rows at all) does not match this rule and is left
  // to whatever other rules the turndown service has for it.
  filter: function (node) {
    if (node.nodeName !== 'TABLE') return false

    // An empty <table> has no first row; passing `undefined` on to
    // isHeadingRow would throw when it reads `parentNode`.
    const firstRow = node.rows && node.rows[0]
    return Boolean(firstRow) && isHeadingRow(firstRow)
  },

  replacement: function (content) {
    // Ensure there are no blank lines anywhere in the table. A string
    // pattern passed to `replace` only affects the first match, so use a
    // global regex that collapses every run of newlines to a single one.
    const compacted = content.replace(/\n{2,}/g, '\n')
    return '\n\n' + compacted + '\n\n'
  }
}

// `thead`, `tbody` and `tfoot` add no markup of their own: the already
// converted rows they contain are passed through unchanged.
rules.tableSection = {
  filter: ['thead', 'tbody', 'tfoot'],
  replacement: (content) => content
}

// Decides whether a `tr` counts as the table's heading row. It does when:
// - its parent is a THEAD, or
// - it's the first child of its parent, and that parent is either the TABLE
//   itself or the first TBODY (possibly following a blank THEAD).
function isHeadingRow (tr) {
  const parent = tr.parentNode

  if (parent.nodeName === 'THEAD') return true
  if (parent.firstChild !== tr) return false

  return parent.nodeName === 'TABLE' || isFirstTbody(parent)
}

// True when `element` is a TBODY that either has no previous sibling or
// directly follows a THEAD whose text content is empty or whitespace only.
function isFirstTbody (element) {
  if (element.nodeName !== 'TBODY') return false

  const preceding = element.previousSibling
  if (!preceding) return true

  return preceding.nodeName === 'THEAD' && /^\s*$/i.test(preceding.textContent)
}

// Formats one table cell: the trimmed content followed by ' |'. The first
// child of the row is prefixed with '| ' to open the line; every other cell
// is prefixed with a single space.
function cell (content, node) {
  const isFirstInRow = indexOf.call(node.parentNode.childNodes, node) === 0
  const lead = isFirstInRow ? '| ' : ' '
  return `${lead}${content.trim()} |`
}

// Turndown plugin entry point: registers every rule in `rules` on the given
// service under its key.
export default function tables (turndownService) {
  for (const [name, rule] of Object.entries(rules)) {
    turndownService.addRule(name, rule)
  }
}
233 changes: 233 additions & 0 deletions test/__fixtures__/google-docs-2023-table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
<html>
<head>
<meta content="text/html; charset=UTF-8" http-equiv="content-type">
<style type="text/css">ol {
margin: 0;
padding: 0
}

table td, table th {
padding: 0
}

.c5 {
border-right-style: solid;
padding: 5pt 5pt 5pt 5pt;
border-bottom-color: #000000;
border-top-width: 1pt;
border-right-width: 1pt;
border-left-color: #000000;
vertical-align: top;
border-right-color: #000000;
border-left-width: 1pt;
border-top-style: solid;
border-left-style: solid;
border-bottom-width: 1pt;
width: 150.5pt;
border-top-color: #000000;
border-bottom-style: solid
}

.c8 {
padding-top: 18pt;
padding-bottom: 6pt;
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left;
height: 16pt
}

.c1 {
padding-top: 0pt;
padding-bottom: 0pt;
line-height: 1.15;
orphans: 2;
widows: 2;
text-align: left;
height: 11pt
}

.c2 {
color: #000000;
font-weight: 400;
text-decoration: none;
vertical-align: baseline;
font-size: 16pt;
font-family: "Arial";
font-style: normal
}

.c7 {
color: #000000;
font-weight: 400;
text-decoration: none;
vertical-align: baseline;
font-size: 11pt;
font-family: "Arial";
font-style: normal
}

.c4 {
padding-top: 0pt;
padding-bottom: 0pt;
line-height: 1.0;
text-align: left
}

.c0 {
border-spacing: 0;
border-collapse: collapse;
margin-right: auto
}

.c3 {
background-color: #ffffff;
max-width: 451.4pt;
padding: 72pt 72pt 72pt 72pt
}

.c6 {
height: 0pt
}

.title {
padding-top: 0pt;
color: #000000;
font-size: 26pt;
padding-bottom: 3pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

.subtitle {
padding-top: 0pt;
color: #666666;
font-size: 15pt;
padding-bottom: 16pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

li {
color: #000000;
font-size: 11pt;
font-family: "Arial"
}

p {
margin: 0;
color: #000000;
font-size: 11pt;
font-family: "Arial"
}

h1 {
padding-top: 20pt;
color: #000000;
font-size: 20pt;
padding-bottom: 6pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h2 {
padding-top: 18pt;
color: #000000;
font-size: 16pt;
padding-bottom: 6pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h3 {
padding-top: 16pt;
color: #434343;
font-size: 14pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h4 {
padding-top: 14pt;
color: #666666;
font-size: 12pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h5 {
padding-top: 12pt;
color: #666666;
font-size: 11pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h6 {
padding-top: 12pt;
color: #666666;
font-size: 11pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
font-style: italic;
orphans: 2;
widows: 2;
text-align: left
}</style>
</head>
<body class="c3 doc-content"><h2 class="c8" id="h.f192ubeddvjb"><span class="c2"></span></h2><a
id="t.fb1748446ca41976868f81f73e9103f328eaf58c"></a><a id="t.0"></a>
<table class="c0">
<tr class="c6">
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">Header 1</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">Header 2</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">Header 3</span></p></td>
</tr>
<tr class="c6">
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">A</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">B</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">C</span></p></td>
</tr>
<tr class="c6">
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">D</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">E</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">F</span></p></td>
</tr>
</table>
</body>
</html>
11 changes: 11 additions & 0 deletions test/html-to-govspeak.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -338,3 +338,14 @@ it('Doesn\'t preserve markdown that is only a link with similar text to the link

expect(htmlToGovspeak(html)).toEqual('https://alphagov.github.io/paste-html-to-govspeak/')
})

// The Google Docs fixture has no <thead>; its first row is expected to be
// treated as the heading row, followed by a border line and the data rows.
it('Converts Google Docs tables to markdown', () => {
  const expectedMarkdown =
    '| Header 1 | Header 2 | Header 3 |\n' +
    '| --- | --- | --- |\n' +
    '| A | B | C |\n' +
    '| D | E | F |'

  const fixtureHtml = openFixture('google-docs-2023-table.html')
  const markdown = htmlToGovspeak(fixtureHtml)

  expect(markdown).toEqual(expectedMarkdown)
})

0 comments on commit 711474b

Please sign in to comment.