From 124d5177b659091023e86ee91838ad890027cba2 Mon Sep 17 00:00:00 2001 From: Olaf Sulich Date: Mon, 13 Jan 2025 13:37:25 +0100 Subject: [PATCH] feat(MarkdownUtil): add sanitizeMarkdown function [WPB-10991] (#18581) * feat(MarkdownUtil): add sanitizeMarkdown function * test(MarkdownUtil): enhance header detection and sanitization tests * refactor(MarkdownUtil): consolidate code block patterns and improve inline code detection * refactor(MarkdownUtil): enhance table pattern regex and improve link/image transformation logic * refactor(MarkdownUtil): enhance table pattern regex to allow variable column lengths * refactor(MarkdownUtil): reorganize and enhance markdown pattern definitions for improved detection and sanitization * refactor(MarkdownUtil): rename list and table pattern constants for clarity and consistency --- src/script/util/MarkdownUtil.test.ts | 93 ++++++++++++- src/script/util/MarkdownUtil.ts | 201 +++++++++++++++++++++------ 2 files changed, 251 insertions(+), 43 deletions(-) diff --git a/src/script/util/MarkdownUtil.test.ts b/src/script/util/MarkdownUtil.test.ts index dfd1c5f36f9..f6c51e1b161 100644 --- a/src/script/util/MarkdownUtil.test.ts +++ b/src/script/util/MarkdownUtil.test.ts @@ -17,7 +17,7 @@ * */ -import {isMarkdownText} from './MarkdownUtil'; +import {isMarkdownText, sanitizeMarkdown} from './MarkdownUtil'; describe('MarkdownUtil', () => { describe('isMarkdownText', () => { @@ -28,6 +28,9 @@ describe('MarkdownUtil', () => { it('returns true for headers', () => { expect(isMarkdownText('# Header')).toBe(true); expect(isMarkdownText('## Header')).toBe(true); + expect(isMarkdownText('### Header')).toBe(true); + expect(isMarkdownText('#### Header')).toBe(true); + expect(isMarkdownText('##### Header')).toBe(true); expect(isMarkdownText('###### Header')).toBe(true); }); @@ -113,4 +116,92 @@ describe('MarkdownUtil', () => { expect(isMarkdownText('\\> Not a blockquote')).toBe(false); }); }); + + describe('sanitizeMarkdown', () => { + it('returns 
empty string for falsy input', () => { + expect(sanitizeMarkdown('')).toBe(''); + }); + + it('removes headers while preserving text', () => { + expect(sanitizeMarkdown('# Header 1')).toBe('Header 1'); + expect(sanitizeMarkdown('## Header 2')).toBe('Header 2'); + expect(sanitizeMarkdown('### Header 3')).toBe('Header 3'); + expect(sanitizeMarkdown('#### Header 4')).toBe('Header 4'); + expect(sanitizeMarkdown('##### Header 5')).toBe('Header 5'); + expect(sanitizeMarkdown('###### Header 6')).toBe('Header 6'); + }); + + it('removes bold formatting', () => { + expect(sanitizeMarkdown('**bold text**')).toBe('bold text'); + expect(sanitizeMarkdown('__also bold__')).toBe('also bold'); + expect(sanitizeMarkdown('normal **bold** normal')).toBe('normal bold normal'); + }); + + it('removes italic formatting', () => { + expect(sanitizeMarkdown('*italic text*')).toBe('italic text'); + expect(sanitizeMarkdown('_also italic_')).toBe('also italic'); + expect(sanitizeMarkdown('normal *italic* normal')).toBe('normal italic normal'); + }); + + it('removes links while preserving link text', () => { + expect(sanitizeMarkdown('[link text](http://example.com)')).toBe('link text'); + expect(sanitizeMarkdown('Click [here](http://example.com) now')).toBe('Click here now'); + expect(sanitizeMarkdown('[](http://example.com)')).toBe(''); + }); + + it('removes list markers', () => { + expect(sanitizeMarkdown('- First item\n- Second item')).toBe('First item\nSecond item'); + expect(sanitizeMarkdown('* Star item\n+ Plus item')).toBe('Star item\nPlus item'); + expect(sanitizeMarkdown('1. First\n2. 
Second')).toBe('First\nSecond'); + }); + + it('removes blockquotes', () => { + expect(sanitizeMarkdown('> quoted text')).toBe('quoted text'); + expect(sanitizeMarkdown('> multiple\n> line quote')).toBe('multiple\nline quote'); + }); + + it('removes code blocks', () => { + expect(sanitizeMarkdown('```\ncode block\n```')).toBe('code block'); + expect(sanitizeMarkdown('`inline code`')).toBe('inline code'); + expect(sanitizeMarkdown('```typescript\nconst x = 1;\n```')).toBe('typescript\nconst x = 1;'); + }); + + it('removes table formatting', () => { + expect(sanitizeMarkdown('| Header |')).toBe('Header'); + expect(sanitizeMarkdown('| Col 1 | Col 2 |\n|--|--|\n| Data 1 | Data 2 |')).toBe('Col 1 Col 2\n\nData 1 Data 2'); + }); + + it('removes strikethrough', () => { + expect(sanitizeMarkdown('~~struck text~~')).toBe('struck text'); + expect(sanitizeMarkdown('normal ~~struck~~ normal')).toBe('normal struck normal'); + }); + + it('handles complex mixed markdown', () => { + const complexMarkdown = `# Main Title + +**Important** _announcement_: + +1. First [point](http://example.com) +2. Second point with ~~strike~~ + +> Quote with \`code\` + +\`\`\` +Example code +\`\`\``; + + const expected = `Main Title + +Important announcement: + +First point +Second point with strike + +Quote with code + +Example code`; + + expect(sanitizeMarkdown(complexMarkdown).replace(/\s+/g, ' ').trim()).toBe(expected.replace(/\s+/g, ' ').trim()); + }); + }); }); diff --git a/src/script/util/MarkdownUtil.ts b/src/script/util/MarkdownUtil.ts index 1f5ec0e836e..bfdb9d91329 100644 --- a/src/script/util/MarkdownUtil.ts +++ b/src/script/util/MarkdownUtil.ts @@ -17,59 +17,176 @@ * */ -export const isMarkdownText = (text: string): boolean => { - if (!text) { - return false; - } - - const markdownPatterns = [ - // Headers (e.g. 
// Basic markdown patterns, shared by detection and sanitization.
// None of them carries the `g` flag: `RegExp.prototype.test` on a global regex
// advances `lastIndex`, which would make repeated detection calls give different answers.
// Sanitization derives its own global copies (see `toGlobal`).
const HEADER_PATTERN = /^#{1,6}\s+/m;
const BOLD_PATTERN_1 = /\*\*[^*]+\*\*/;
const BOLD_PATTERN_2 = /__[^_]+__/;
const ITALIC_PATTERN_1 = /\*[^*]+\*/;
const ITALIC_PATTERN_2 = /_[^_]+_/;
const LINK_PATTERN = /\[[^\]\r\n]{0,500}\]\([^()\r\n]{0,1000}\)/;
const IMAGE_PATTERN = /!\[[^\]]*\]\([^)]*\)/;
const BLOCKQUOTE_PATTERN = /^>\s+/m;
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/;
const CODE_INLINE_PATTERN = /`[^`]+`/;
const HORIZONTAL_RULE_PATTERN = /^(?:[-*_]){3,}\s*$/m;
const STRIKETHROUGH_PATTERN = /~~[^~]+~~/;

// List patterns
const LIST_UNORDERED_DETECT_PATTERN = /^[-*+]\s.*/m;
const LIST_ORDERED_DETECT_PATTERN = /^[\d]+\.\s.*/m;

// Table patterns
const TABLE_ROW_DETECT_PATTERN = /^\|[^|]+\|/m;
const TABLE_SEPARATOR_DETECT_PATTERN = /^\|[-:|]+\|/m;
const TABLE_SANITIZE_PATTERN = /^\|.*\|$/gm;

// Special patterns
const ESCAPED_CHARS_PATTERN = /\\(.)/g;
const INVALID_CHARS_PATTERN = /\\([\\`*_{}[\]()#+\-.!>])/;

const MARKDOWN_PATTERNS: readonly RegExp[] = [
  HEADER_PATTERN,
  BOLD_PATTERN_1,
  BOLD_PATTERN_2,
  ITALIC_PATTERN_1,
  ITALIC_PATTERN_2,
  LINK_PATTERN,
  IMAGE_PATTERN,
  LIST_UNORDERED_DETECT_PATTERN,
  LIST_ORDERED_DETECT_PATTERN,
  BLOCKQUOTE_PATTERN,
  CODE_BLOCK_PATTERN,
  CODE_INLINE_PATTERN,
  HORIZONTAL_RULE_PATTERN,
  TABLE_ROW_DETECT_PATTERN,
  TABLE_SEPARATOR_DETECT_PATTERN,
  STRIKETHROUGH_PATTERN,
];

/**
 * Checks if the given text string contains markdown.
 *
 * @param text The text to inspect.
 * @returns `false` for empty input or when the text contains a backslash-escaped
 * markdown character; otherwise `true` if any known markdown pattern matches.
 */
export const isMarkdownText = (text: string): boolean => {
  if (!text) {
    return false;
  }

  if (INVALID_CHARS_PATTERN.test(text)) {
    return false;
  }

  return MARKDOWN_PATTERNS.some(pattern => pattern.test(text));
};

interface MarkdownSanitizer {
  pattern: RegExp;
  transform: (match: string) => string;
}

/**
 * Returns a copy of `pattern` with the `g` flag added, so `String.prototype.replace`
 * rewrites every occurrence instead of only the first one.
 */
const toGlobal = (pattern: RegExp): RegExp =>
  pattern.global ? pattern : new RegExp(pattern.source, `${pattern.flags}g`);

/** Returns the text between the first `[` and the first `]` of a link/image match, or '' if there is none. */
const extractBracketText = (match: string): string => {
  const start = match.indexOf('[') + 1;
  const end = match.indexOf(']');
  return start > 0 && end > start ? match.slice(start, end) : '';
};

/** Tells whether a trimmed table line consists only of separator cells such as `|--|:-:|`. */
const isTableSeparator = (line: string): boolean => {
  if (!line.startsWith('|') || !line.endsWith('|')) {
    return false;
  }
  const cells = line.slice(1, -1).split('|');
  return cells.every(cell => /^[-:|]+$/.test(cell.trim()));
};

// Applied in order by `sanitizeMarkdown`; every pattern is global so all occurrences are rewritten.
const markdownSanitizers: readonly MarkdownSanitizer[] = [
  {
    // The match is a backslash followed by exactly one character; keep only that character.
    pattern: ESCAPED_CHARS_PATTERN,
    transform: match => match.slice(1),
  },
  {
    pattern: toGlobal(HEADER_PATTERN),
    transform: () => '',
  },
  {
    pattern: toGlobal(BOLD_PATTERN_1),
    transform: match => match.slice(2, -2),
  },
  {
    pattern: toGlobal(BOLD_PATTERN_2),
    transform: match => match.slice(2, -2),
  },
  {
    pattern: toGlobal(ITALIC_PATTERN_1),
    transform: match => match.slice(1, -1),
  },
  {
    pattern: toGlobal(ITALIC_PATTERN_2),
    transform: match => match.slice(1, -1),
  },
  {
    // Images must be handled before links: the link pattern also matches the
    // `[alt](url)` part of `![alt](url)` and would leave a stray `!` behind.
    pattern: toGlobal(IMAGE_PATTERN),
    transform: extractBracketText,
  },
  {
    pattern: toGlobal(LINK_PATTERN),
    transform: extractBracketText,
  },
  {
    pattern: toGlobal(LIST_UNORDERED_DETECT_PATTERN),
    transform: match => match.replace(/^[-*+]\s/, ''),
  },
  {
    pattern: toGlobal(LIST_ORDERED_DETECT_PATTERN),
    transform: match => match.replace(/^[\d]+\.\s/, ''),
  },
  {
    pattern: toGlobal(BLOCKQUOTE_PATTERN),
    transform: () => '',
  },
  {
    pattern: toGlobal(CODE_BLOCK_PATTERN),
    transform: match => match.replace(/```/g, '').trim(),
  },
  {
    pattern: toGlobal(CODE_INLINE_PATTERN),
    transform: match => match.slice(1, -1),
  },
  {
    pattern: toGlobal(HORIZONTAL_RULE_PATTERN),
    transform: () => '',
  },
  {
    pattern: TABLE_SANITIZE_PATTERN,
    transform: match => {
      const line = match.trim();
      if (isTableSeparator(line)) {
        return '';
      }
      return line
        .split('|')
        .filter(cell => cell.trim())
        .map(cell => cell.trim())
        .join(' ');
    },
  },
  {
    pattern: toGlobal(STRIKETHROUGH_PATTERN),
    transform: match => match.slice(2, -2),
  },
];

/**
 * Removes all markdown formatting from a given string.
 *
 * @param text The markdown text to clean up.
 * @returns The text with every occurrence of the known markdown constructs
 * stripped and surrounding whitespace trimmed; '' for empty input.
 */
export const sanitizeMarkdown = (text: string): string => {
  if (!text) {
    return '';
  }

  return markdownSanitizers
    .reduce((sanitizedText, {pattern, transform}) => sanitizedText.replace(pattern, transform), text)
    .trim();
};