Skip to content

Commit

Permalink
feat(MarkdownUtil): add sanitizeMarkdown function [WPB-10991] (#18581)
Browse files Browse the repository at this point in the history
* feat(MarkdownUtil): add sanitizeMarkdown function

* test(MarkdownUtil): enhance header detection and sanitization tests

* refactor(MarkdownUtil): consolidate code block patterns and improve inline code detection

* refactor(MarkdownUtil): enhance table pattern regex and improve link/image transformation logic

* refactor(MarkdownUtil): enhance table pattern regex to allow variable column lengths

* refactor(MarkdownUtil): reorganize and enhance markdown pattern definitions for improved detection and sanitization

* refactor(MarkdownUtil): rename list and table pattern constants for clarity and consistency
  • Loading branch information
olafsulich authored Jan 13, 2025
1 parent 9ce008c commit 124d517
Show file tree
Hide file tree
Showing 2 changed files with 251 additions and 43 deletions.
93 changes: 92 additions & 1 deletion src/script/util/MarkdownUtil.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
*
*/

import {isMarkdownText} from './MarkdownUtil';
import {isMarkdownText, sanitizeMarkdown} from './MarkdownUtil';

describe('MarkdownUtil', () => {
describe('isMarkdownText', () => {
Expand All @@ -28,6 +28,9 @@ describe('MarkdownUtil', () => {
// Header detection must succeed for every level from one to six leading `#` characters.
it('returns true for headers', () => {
expect(isMarkdownText('# Header')).toBe(true);
expect(isMarkdownText('## Header')).toBe(true);
expect(isMarkdownText('### Header')).toBe(true);
expect(isMarkdownText('#### Header')).toBe(true);
expect(isMarkdownText('##### Header')).toBe(true);
expect(isMarkdownText('###### Header')).toBe(true);
});

Expand Down Expand Up @@ -113,4 +116,92 @@ describe('MarkdownUtil', () => {
expect(isMarkdownText('\\> Not a blockquote')).toBe(false);
});
});

// Specification-by-example for sanitizeMarkdown: each case feeds one markdown construct
// and pins the plain text that is expected to remain once the syntax is stripped.
describe('sanitizeMarkdown', () => {
it('returns empty string for falsy input', () => {
expect(sanitizeMarkdown('')).toBe('');
});

// One assertion per header level (one to six `#` characters).
it('removes headers while preserving text', () => {
expect(sanitizeMarkdown('# Header 1')).toBe('Header 1');
expect(sanitizeMarkdown('## Header 2')).toBe('Header 2');
expect(sanitizeMarkdown('### Header 3')).toBe('Header 3');
expect(sanitizeMarkdown('#### Header 4')).toBe('Header 4');
expect(sanitizeMarkdown('##### Header 5')).toBe('Header 5');
expect(sanitizeMarkdown('###### Header 6')).toBe('Header 6');
});

// Both delimiter styles (`**` and `__`) are covered, plus bold embedded in plain text.
it('removes bold formatting', () => {
expect(sanitizeMarkdown('**bold text**')).toBe('bold text');
expect(sanitizeMarkdown('__also bold__')).toBe('also bold');
expect(sanitizeMarkdown('normal **bold** normal')).toBe('normal bold normal');
});

// Both delimiter styles (`*` and `_`) are covered, plus italic embedded in plain text.
it('removes italic formatting', () => {
expect(sanitizeMarkdown('*italic text*')).toBe('italic text');
expect(sanitizeMarkdown('_also italic_')).toBe('also italic');
expect(sanitizeMarkdown('normal *italic* normal')).toBe('normal italic normal');
});

it('removes links while preserving link text', () => {
expect(sanitizeMarkdown('[link text](http://example.com)')).toBe('link text');
expect(sanitizeMarkdown('Click [here](http://example.com) now')).toBe('Click here now');
// A link with empty text leaves nothing behind.
expect(sanitizeMarkdown('[](http://example.com)')).toBe('');
});

// Covers the three unordered markers (`-`, `*`, `+`) and numbered items.
it('removes list markers', () => {
expect(sanitizeMarkdown('- First item\n- Second item')).toBe('First item\nSecond item');
expect(sanitizeMarkdown('* Star item\n+ Plus item')).toBe('Star item\nPlus item');
expect(sanitizeMarkdown('1. First\n2. Second')).toBe('First\nSecond');
});

it('removes blockquotes', () => {
expect(sanitizeMarkdown('> quoted text')).toBe('quoted text');
// The marker must be stripped from every quoted line, not only the first.
expect(sanitizeMarkdown('> multiple\n> line quote')).toBe('multiple\nline quote');
});

it('removes code blocks', () => {
expect(sanitizeMarkdown('```\ncode block\n```')).toBe('code block');
expect(sanitizeMarkdown('`inline code`')).toBe('inline code');
// Only the backtick fences are removed: the language tag after the opening fence
// (`typescript`) is expected to survive as ordinary text.
expect(sanitizeMarkdown('```typescript\nconst x = 1;\n```')).toBe('typescript\nconst x = 1;');
});

it('removes table formatting', () => {
expect(sanitizeMarkdown('| Header |')).toBe('Header');
// The separator row (`|--|--|`) is expected to collapse to an empty line rather than
// disappear, which is why the expected output contains two consecutive newlines.
expect(sanitizeMarkdown('| Col 1 | Col 2 |\n|--|--|\n| Data 1 | Data 2 |')).toBe('Col 1 Col 2\n\nData 1 Data 2');
});

it('removes strikethrough', () => {
expect(sanitizeMarkdown('~~struck text~~')).toBe('struck text');
expect(sanitizeMarkdown('normal ~~struck~~ normal')).toBe('normal struck normal');
});

// Combines header, bold, italic, ordered list, link, strikethrough, blockquote,
// inline code and a fenced code block in a single input.
it('handles complex mixed markdown', () => {
const complexMarkdown = `# Main Title
**Important** _announcement_:
1. First [point](http://example.com)
2. Second point with ~~strike~~
> Quote with \`code\`
\`\`\`
Example code
\`\`\``;

const expected = `Main Title
Important announcement:
First point
Second point with strike
Quote with code
Example code`;

// Whitespace runs are collapsed to single spaces on both sides before comparing, so this
// case checks the remaining words and their order, not the exact line structure.
expect(sanitizeMarkdown(complexMarkdown).replace(/\s+/g, ' ').trim()).toBe(expected.replace(/\s+/g, ' ').trim());
});
});
});
201 changes: 159 additions & 42 deletions src/script/util/MarkdownUtil.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,59 +17,176 @@
*
*/

export const isMarkdownText = (text: string): boolean => {
if (!text) {
return false;
}

const markdownPatterns = [
// Headers (e.g. # Header)
/^#{1,6}\s+/m,
// Basic markdown patterns (used for both detection and sanitization)
// Unless noted otherwise these carry no `g` flag: `.test()` on them is stateless, and
// `String.prototype.replace` with them rewrites only the first match in the input.

// Header marker: one to six `#` at the start of a line followed by whitespace.
// Only the marker is matched, not the header text.
const HEADER_PATTERN = /^#{1,6}\s+/m;
// Bold written as `**text**`; the text must be non-empty and contain no `*`.
const BOLD_PATTERN_1 = /\*\*[^*]+\*\*/;
// Bold written as `__text__`; the text must be non-empty and contain no `_`.
const BOLD_PATTERN_2 = /__[^_]+__/;
// Italic written as `*text*`; the text must be non-empty and contain no `*`.
const ITALIC_PATTERN_1 = /\*[^*]+\*/;
// Italic written as `_text_`; the text must be non-empty and contain no `_`.
const ITALIC_PATTERN_2 = /_[^_]+_/;
// Link `[text](target)` on a single line. The text is capped at 500 characters and the
// target at 1000; the target may not contain parentheses.
const LINK_PATTERN = /\[[^\]\r\n]{0,500}\]\([^()\r\n]{0,1000}\)/;
// Image `![alt](target)`. Unlike LINK_PATTERN there are no length caps and line breaks
// are not excluded.
const IMAGE_PATTERN = /!\[[^\]]*\]\([^)]*\)/;
// Blockquote marker: `>` at the start of a line followed by whitespace.
// This is the only pattern in this group with the `g` flag. With `replace` that strips
// the marker from every line, but `.test()`/`.exec()` on this instance advance
// `lastIndex` and are therefore stateful across calls.
const BLOCKQUOTE_PATTERN = /^>\s+/gm;
// Fenced code block: the shortest span between two triple-backtick fences, across lines.
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/;
// Inline code: non-empty text between single backticks.
const CODE_INLINE_PATTERN = /`[^`]+`/;
// Horizontal rule: a line made of three or more `-`, `*` or `_` characters (any mix),
// optionally followed by trailing whitespace.
const HORIZONTAL_RULE_PATTERN = /^(?:[-*_]){3,}\s*$/m;
// Strikethrough written as `~~text~~`; the text must be non-empty and contain no `~`.
const STRIKETHROUGH_PATTERN = /~~[^~]+~~/;

// Bold (e.g. **bold** or __bold__)
/\*\*[^*]+\*\*/,
/__[^_]+__/,
// List patterns
// Each pattern matches a whole list line: the marker, one whitespace character, and the
// rest of the line. The SANITIZE variants add the `g` flag so `replace` visits every list
// line; the DETECT variants omit it so `.test()` stays stateless.

// Unordered item introduced by `-`, `*` or `+` (all matching lines).
const LIST_UNORDERED_SANITIZE_PATTERN = /^[-*+]\s.*/gm;
// Ordered item introduced by one or more digits and a dot (all matching lines).
const LIST_ORDERED_SANITIZE_PATTERN = /^[\d]+\.\s.*/gm;
// Unordered item, detection only.
const LIST_UNORDERED_DETECT_PATTERN = /^[-*+]\s.*/m;
// Ordered item, detection only.
const LIST_ORDERED_DETECT_PATTERN = /^[\d]+\.\s.*/m;

// Italic (e.g. *italic* or _italic_)
/\*[^*]+\*/,
/_[^_]+_/,
// Table patterns
// Table row: a line that starts with `|`, followed by at least one non-pipe character
// and another `|`.
const TABLE_ROW_DETECT_PATTERN = /^\|[^|]+\|/m;
// Table separator row: a line that starts with `|`, followed by one or more of `-`, `:`
// or `|`, and then a `|`.
const TABLE_SEPARATOR_DETECT_PATTERN = /^\|[-:|]+\|/m;
// Any complete line that both starts and ends with `|`. The `g` flag lets `replace`
// process every such line of the input.
const TABLE_SANITIZE_PATTERN = /^\|.*\|$/gm;

// Links (e.g. [text](http://example.com))
/\[[^\]\r\n]{0,500}\]\([^()\r\n]{0,1000}\)/,
// Special patterns
// A backslash followed by any single character other than a line break; the character is
// captured in group 1. Global, so `replace` handles every escape sequence in the input.
const ESCAPED_CHARS_PATTERN = /\\(.)/g;
// A backslash followed by one of the markdown punctuation characters
// \ ` * _ { } [ ] ( ) # + - . ! > (i.e. an escaped markdown character such as `\*`).
const INVALID_CHARS_PATTERN = /\\([\\`*_{}[\]()#+\-.!>])/;

// Images (e.g. ![alt](url))
/!\[[^\]]*\]\([^)]*\)/,
/**
 * Patterns probed when detecting markdown, in the order they are tried.
 *
 * Every entry is a private copy of the shared pattern with the `g` and `y` flags removed.
 * `RegExp.prototype.test` on a global (or sticky) regex starts at `lastIndex` and moves it
 * after a successful match, so probing a shared global instance such as
 * BLOCKQUOTE_PATTERN would make detection alternate between `true` and `false` on
 * repeated calls with the same text. Flag-free copies keep `.test()` stateless while
 * leaving the shared constants untouched for the code that needs their `g` flag.
 */
const MARKDOWN_PATTERNS: RegExp[] = [
  HEADER_PATTERN,
  BOLD_PATTERN_1,
  BOLD_PATTERN_2,
  ITALIC_PATTERN_1,
  ITALIC_PATTERN_2,
  LINK_PATTERN,
  IMAGE_PATTERN,
  LIST_UNORDERED_DETECT_PATTERN,
  LIST_ORDERED_DETECT_PATTERN,
  BLOCKQUOTE_PATTERN,
  CODE_BLOCK_PATTERN,
  CODE_INLINE_PATTERN,
  HORIZONTAL_RULE_PATTERN,
  TABLE_ROW_DETECT_PATTERN,
  TABLE_SEPARATOR_DETECT_PATTERN,
  STRIKETHROUGH_PATTERN,
].map(pattern => new RegExp(pattern.source, pattern.flags.replace(/[gy]/g, '')));

// Lists
/^[-*+]\s[^\n]*$/m, // Unordered (e.g. - item, * item)
/^\d+\.\s[^\n]*$/m, // Ordered (e.g. 1. item)

// Blockquotes (e.g. > quote)
/^>\s+/m,
/**
* Checks if the given text string contains markdown.
*/
export const isMarkdownText = (text: string): boolean => {
if (!text) {
return false;
}

// Code blocks (e.g. ``` code ``` or `inline code`)
/```[\s\S]*?```/,
/`[^`]+`/,
if (INVALID_CHARS_PATTERN.test(text)) {
return false;
}

// Horizontal rules (e.g. --- or *** or ___)
/^(?:[-*_]){3,}\s*$/m,
return MARKDOWN_PATTERNS.some(pattern => pattern.test(text));
};

// Tables (e.g. | Header | row | --- | :---: |)
/\|[^|]+\|/,
/^[-:|]+$/m,
/**
* Removes all markdown formatting from a given string.
*/
export const sanitizeMarkdown = (text: string): string => {
if (!text) {
return '';
}

// Strikethrough (e.g., ~~text~~)
/~~[^~]+~~/,
];
return markdownSanitizers
.reduce((sanitizedText, {pattern, transform}) => sanitizedText.replace(pattern, transform), text)
.trim();
};

const invalidPatterns = [
// Escaped markdown characters (\*not italic\*)
/\\([\\`*_{}[\]()#+\-.!>])/,
];
/**
 * Replacement callback in the shape accepted by `String.prototype.replace`: it receives
 * the full match followed by whatever extra arguments the engine passes (capture groups,
 * offset, input string) and returns the text to substitute.
 *
 * The rest parameter stays `any[]` so callbacks can declare typed capture-group
 * parameters, e.g. `(match: string, char: string) => string`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type MarkdownTransform = (match: string, ...args: any[]) => string;

/**
 * One sanitization step: a pattern locating a markdown construct and the callback that
 * produces its plain-text replacement.
 */
interface MarkdownSanitizer {
  /** Regular expression that finds the markdown construct. */
  pattern: RegExp;
  /** Produces the replacement text for each match of `pattern`. */
  transform: MarkdownTransform;
}

if (invalidPatterns.some(pattern => pattern.test(text))) {
const isTableSeparator = (line: string): boolean => {
if (!line.startsWith('|') || !line.endsWith('|')) {
return false;
}

return markdownPatterns.some(pattern => pattern.test(text));
const cells = line.slice(1, -1).split('|');
return cells.every(cell => /^[-:|]+$/.test(cell.trim()));
};

/**
 * Returns a private copy of `pattern` that always carries the `g` flag.
 *
 * `String.prototype.replace` rewrites only the first match when given a non-global regex,
 * which would leave every later occurrence of a construct (second header, second bold
 * span, ...) unsanitized. Copying also keeps the sanitizer's regex instances separate
 * from the shared pattern constants.
 */
const toGlobalPattern = (pattern: RegExp): RegExp =>
  new RegExp(pattern.source, pattern.global ? pattern.flags : `${pattern.flags}g`);

/**
 * Extracts the text between the first `[` and the first `]` of a link or image match.
 * Returns an empty string when the brackets are missing or enclose nothing.
 */
const extractBracketText = (match: string): string => {
  const start = match.indexOf('[') + 1;
  const end = match.indexOf(']');
  return start > 0 && end > start ? match.slice(start, end) : '';
};

/**
 * Ordered list of sanitization steps. Each step's pattern is applied to the whole text
 * (all occurrences) and every match is replaced by the result of `transform`.
 *
 * The order matters:
 * - escaped characters are unescaped first,
 * - bold is stripped before italic so `**x**` is not half-consumed as `*...*`,
 * - images are stripped before links, because LINK_PATTERN also matches the
 *   `[alt](url)` tail of `![alt](url)` and would otherwise leave a stray `!` behind,
 * - fenced code blocks are unwrapped before inline code.
 */
const markdownSanitizers: MarkdownSanitizer[] = [
  {
    // `\x` -> `x`
    pattern: toGlobalPattern(ESCAPED_CHARS_PATTERN),
    transform: (_match: string, char: string) => char,
  },
  {
    // Drop the leading `#` marker, keep the header text.
    pattern: toGlobalPattern(HEADER_PATTERN),
    transform: (_match: string) => '',
  },
  {
    // `**text**` -> `text`
    pattern: toGlobalPattern(BOLD_PATTERN_1),
    transform: (match: string) => match.slice(2, -2),
  },
  {
    // `__text__` -> `text`
    pattern: toGlobalPattern(BOLD_PATTERN_2),
    transform: (match: string) => match.slice(2, -2),
  },
  {
    // `*text*` -> `text`
    pattern: toGlobalPattern(ITALIC_PATTERN_1),
    transform: (match: string) => match.slice(1, -1),
  },
  {
    // `_text_` -> `text`
    pattern: toGlobalPattern(ITALIC_PATTERN_2),
    transform: (match: string) => match.slice(1, -1),
  },
  {
    // `![alt](url)` -> `alt`; must run before the link step (see above).
    pattern: toGlobalPattern(IMAGE_PATTERN),
    transform: extractBracketText,
  },
  {
    // `[text](url)` -> `text`
    pattern: toGlobalPattern(LINK_PATTERN),
    transform: extractBracketText,
  },
  {
    // `- item` / `* item` / `+ item` -> `item`
    pattern: toGlobalPattern(LIST_UNORDERED_SANITIZE_PATTERN),
    transform: (match: string) => match.replace(/^[-*+]\s/, ''),
  },
  {
    // `1. item` -> `item`
    pattern: toGlobalPattern(LIST_ORDERED_SANITIZE_PATTERN),
    transform: (match: string) => match.replace(/^[\d]+\.\s/, ''),
  },
  {
    // Drop the leading `>` marker, keep the quoted text.
    pattern: toGlobalPattern(BLOCKQUOTE_PATTERN),
    transform: (_match: string) => '',
  },
  {
    // Remove the fences; anything between them (including a language tag) is kept.
    pattern: toGlobalPattern(CODE_BLOCK_PATTERN),
    transform: (match: string) => match.replace(/```/g, '').trim(),
  },
  {
    // `code` wrapped in single backticks -> `code`
    pattern: toGlobalPattern(CODE_INLINE_PATTERN),
    transform: (match: string) => match.slice(1, -1),
  },
  {
    // Horizontal rules carry no text and are removed entirely.
    pattern: toGlobalPattern(HORIZONTAL_RULE_PATTERN),
    transform: (_match: string) => '',
  },
  {
    // Table lines: separator rows become empty, other rows become their non-empty cells
    // joined by single spaces.
    pattern: toGlobalPattern(TABLE_SANITIZE_PATTERN),
    transform: (match: string) => {
      const line = match.trim();
      if (isTableSeparator(line)) {
        return '';
      }
      return line
        .split('|')
        .map(cell => cell.trim())
        .filter(cell => cell.length > 0)
        .join(' ');
    },
  },
  {
    // `~~text~~` -> `text`
    pattern: toGlobalPattern(STRIKETHROUGH_PATTERN),
    transform: (match: string) => match.slice(2, -2),
  },
];

0 comments on commit 124d517

Please sign in to comment.