Skip to content

Commit

Permalink
Ensure table plugin works for tables without a correctly formulated header row
Browse files Browse the repository at this point in the history

Whilst HTML tables should technically contain a THEAD element with a TH for each cell in the header, most programs tested (including Google Docs, LibreOffice and Google Sheets) do not produce such tables by default. Instead, the first row is a standard TR element with a TD element for each cell. This commit modifies the table handling code so that it still works for these incorrectly formatted tables: it assumes that the first row of the table can be used as the heading row and formats the markdown accordingly.
  • Loading branch information
ryanb-gds committed Sep 28, 2023
1 parent 48db70e commit a251d62
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 20 deletions.
34 changes: 14 additions & 20 deletions src/tables.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
var indexOf = Array.prototype.indexOf
var every = Array.prototype.every
var rules = {}
const indexOf = Array.prototype.indexOf
const rules = {}

rules.tableCell = {
filter: ['th', 'td'],
Expand All @@ -12,13 +11,13 @@ rules.tableCell = {
rules.tableRow = {
filter: 'tr',
replacement: function (content, node) {
var borderCells = ''
var alignMap = { left: ':--', right: '--:', center: ':-:' }
let borderCells = ''
const alignMap = { left: ':--', right: '--:', center: ':-:' }

if (isHeadingRow(node)) {
for (var i = 0; i < node.childNodes.length; i++) {
var border = '---'
var align = (
for (let i = 0; i < node.childNodes.length; i++) {
let border = '---'
const align = (
node.childNodes[i].getAttribute('align') || ''
).toLowerCase()

Expand Down Expand Up @@ -56,21 +55,19 @@ rules.tableSection = {
/**
 * Decide whether a TR should be treated as the table's heading row.
 *
 * A TR is a heading row if:
 * - its parent is a THEAD
 * - or it is the first child of the TABLE or of the first TBODY (possibly
 *   following a blank THEAD)
 *
 * The cells are deliberately NOT required to be TH elements: tables pasted
 * from Google Docs, LibreOffice or Google Sheets use plain TD cells in their
 * first row, and that row is still assumed to be the heading.
 *
 * @param {Node} tr - the TR element being converted
 * @returns {boolean} true when the row should be rendered as the heading row
 */
function isHeadingRow (tr) {
  const parentNode = tr.parentNode
  return (
    parentNode.nodeName === 'THEAD' ||
    (
      parentNode.firstChild === tr &&
      (parentNode.nodeName === 'TABLE' || isFirstTbody(parentNode))
    )
  )
}

function isFirstTbody (element) {
var previousSibling = element.previousSibling
const previousSibling = element.previousSibling
return (
element.nodeName === 'TBODY' && (
!previousSibling ||
Expand All @@ -83,15 +80,12 @@ function isFirstTbody (element) {
}

/**
 * Wrap the converted content of a table cell in Markdown pipe delimiters.
 *
 * Every cell is closed with ` |`; only the first cell of a row also opens
 * with `| `, so the cells of one row concatenate into `| a | b | c |`.
 *
 * @param {string} content - already-converted Markdown for the cell
 * @param {Node} node - the TH/TD node; its position among its parent's
 *   childNodes decides whether the leading pipe is emitted
 * @returns {string} the delimited cell text
 */
function cell (content, node) {
  const index = indexOf.call(node.parentNode.childNodes, node)
  const prefix = index === 0 ? '| ' : ' '
  return prefix + content.trim() + ' |'
}

/**
 * Turndown plugin entry point: registers every table rule on the service.
 *
 * Tables whose first row is not a "proper" heading row are no longer kept
 * as raw HTML; the first row is treated as the heading instead, so no
 * `keep` filter is installed here.
 *
 * @param {object} turndownService - service instance exposing `addRule(key, rule)`
 */
export default function tables (turndownService) {
  for (const [key, rule] of Object.entries(rules)) {
    turndownService.addRule(key, rule)
  }
}
233 changes: 233 additions & 0 deletions test/__fixtures__/google-docs-2023-table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
<html>
<head>
<meta content="text/html; charset=UTF-8" http-equiv="content-type">
<style type="text/css">ol {
margin: 0;
padding: 0
}

table td, table th {
padding: 0
}

.c5 {
border-right-style: solid;
padding: 5pt 5pt 5pt 5pt;
border-bottom-color: #000000;
border-top-width: 1pt;
border-right-width: 1pt;
border-left-color: #000000;
vertical-align: top;
border-right-color: #000000;
border-left-width: 1pt;
border-top-style: solid;
border-left-style: solid;
border-bottom-width: 1pt;
width: 150.5pt;
border-top-color: #000000;
border-bottom-style: solid
}

.c8 {
padding-top: 18pt;
padding-bottom: 6pt;
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left;
height: 16pt
}

.c1 {
padding-top: 0pt;
padding-bottom: 0pt;
line-height: 1.15;
orphans: 2;
widows: 2;
text-align: left;
height: 11pt
}

.c2 {
color: #000000;
font-weight: 400;
text-decoration: none;
vertical-align: baseline;
font-size: 16pt;
font-family: "Arial";
font-style: normal
}

.c7 {
color: #000000;
font-weight: 400;
text-decoration: none;
vertical-align: baseline;
font-size: 11pt;
font-family: "Arial";
font-style: normal
}

.c4 {
padding-top: 0pt;
padding-bottom: 0pt;
line-height: 1.0;
text-align: left
}

.c0 {
border-spacing: 0;
border-collapse: collapse;
margin-right: auto
}

.c3 {
background-color: #ffffff;
max-width: 451.4pt;
padding: 72pt 72pt 72pt 72pt
}

.c6 {
height: 0pt
}

.title {
padding-top: 0pt;
color: #000000;
font-size: 26pt;
padding-bottom: 3pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

.subtitle {
padding-top: 0pt;
color: #666666;
font-size: 15pt;
padding-bottom: 16pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

li {
color: #000000;
font-size: 11pt;
font-family: "Arial"
}

p {
margin: 0;
color: #000000;
font-size: 11pt;
font-family: "Arial"
}

h1 {
padding-top: 20pt;
color: #000000;
font-size: 20pt;
padding-bottom: 6pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h2 {
padding-top: 18pt;
color: #000000;
font-size: 16pt;
padding-bottom: 6pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h3 {
padding-top: 16pt;
color: #434343;
font-size: 14pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h4 {
padding-top: 14pt;
color: #666666;
font-size: 12pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h5 {
padding-top: 12pt;
color: #666666;
font-size: 11pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
orphans: 2;
widows: 2;
text-align: left
}

h6 {
padding-top: 12pt;
color: #666666;
font-size: 11pt;
padding-bottom: 4pt;
font-family: "Arial";
line-height: 1.15;
page-break-after: avoid;
font-style: italic;
orphans: 2;
widows: 2;
text-align: left
}</style>
</head>
<body class="c3 doc-content"><h2 class="c8" id="h.f192ubeddvjb"><span class="c2"></span></h2><a
id="t.fb1748446ca41976868f81f73e9103f328eaf58c"></a><a id="t.0"></a>
<table class="c0">
<tr class="c6">
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">Header 1</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">Header 2</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">Header 3</span></p></td>
</tr>
<tr class="c6">
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">A</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">B</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">C</span></p></td>
</tr>
<tr class="c6">
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">D</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">E</span></p></td>
<td class="c5" colspan="1" rowspan="1"><p class="c4"><span class="c2">F</span></p></td>
</tr>
</table>
</body>
</html>
11 changes: 11 additions & 0 deletions test/html-to-govspeak.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -338,3 +338,14 @@ it('Doesn\'t preserve markdown that is only a link with similar text to the link

expect(htmlToGovspeak(html)).toEqual('https://alphagov.github.io/paste-html-to-govspeak/')
})

it('Converts Google Docs tables to markdown', () => {
  // The fixture's table has no THEAD/TH: its first row is plain TR/TD cells
  // and must still come out as the Markdown heading row.
  const expectedMarkdown =
    '| Header 1 | Header 2 | Header 3 |\n' +
    '| --- | --- | --- |\n' +
    '| A | B | C |\n' +
    '| D | E | F |'

  const fixtureHtml = openFixture('google-docs-2023-table.html')
  const actualMarkdown = htmlToGovspeak(fixtureHtml)

  expect(actualMarkdown).toEqual(expectedMarkdown)
})

0 comments on commit a251d62

Please sign in to comment.