From ea674a2aca319ad75e8e65ee446cf85d72df425e Mon Sep 17 00:00:00 2001 From: Thomas Timmer Date: Thu, 21 Nov 2024 11:52:40 +0100 Subject: [PATCH] feat: add replace special characters option to parse document --- __tests__/parse-document/1.1/index.test.js | 54 +++++++++++++++++ __tests__/support/document-parser/index.js | 14 ++++- blocks/parse-document.json | 2 +- functions/parse-document/1.1/function.json | 69 ++++++++++++++++++++++ functions/parse-document/1.1/index.js | 24 ++++++++ package.json | 5 +- 6 files changed, 162 insertions(+), 6 deletions(-) create mode 100644 __tests__/parse-document/1.1/index.test.js create mode 100644 functions/parse-document/1.1/function.json create mode 100644 functions/parse-document/1.1/index.js diff --git a/__tests__/parse-document/1.1/index.test.js b/__tests__/parse-document/1.1/index.test.js new file mode 100644 index 0000000..a5a69eb --- /dev/null +++ b/__tests__/parse-document/1.1/index.test.js @@ -0,0 +1,54 @@ +import parseDocument from '../../../functions/parse-document/1.1'; + +describe('Parse document', () => { + test('It returns a string based on a URL', async () => { + const { result } = await parseDocument({ + document: 'https://www.example.com/document.pdf', + density: 300, + forceImage: false, + }); + + expect(result).toEqual('Dummy result'); + }); + + test('It returns a string based on a file property', async () => { + const { result } = await parseDocument({ + document: { + name: 'document.pdf', + url: 'https://www.example.com/document.pdf', + }, + density: 300, + forceImage: false, + }); + + expect(result).toEqual('Dummy result'); + }); + + test('It replaces special sequence of characters to spaces if enabled', async () => { + const { result } = await parseDocument({ + document: { + name: 'special-document.pdf', + url: 'https://www.example.com/special-document.pdf', + }, + density: 300, + forceImage: false, + removeSpecialCharacters: true, + }); + + expect(result).toEqual('Dummy result with special characters'); 
+ }); + + test("It doesn't replaces special sequence of characters to spaces if disabled", async () => { + const { result } = await parseDocument({ + document: { + name: 'special-document.pdf', + url: 'https://www.example.com/special-document.pdf', + }, + density: 300, + forceImage: false, + removeSpecialCharacters: false, + }); + + expect(result).toEqual('Dummy result with special characters'); + }); +}); diff --git a/__tests__/support/document-parser/index.js b/__tests__/support/document-parser/index.js index 7645b50..f93aa62 100644 --- a/__tests__/support/document-parser/index.js +++ b/__tests__/support/document-parser/index.js @@ -1,5 +1,13 @@ -const documentParser = async (_) => ({ - result: 'Dummy result', -}); +const documentParser = async ({ document }) => { + if (document === 'https://www.example.com/special-document.pdf') { + return { + result: 'Dummy result with special characters', + }; + } + + return { + result: 'Dummy result', + }; +}; export default documentParser; diff --git a/blocks/parse-document.json b/blocks/parse-document.json index 28488bd..6cb21a4 100644 --- a/blocks/parse-document.json +++ b/blocks/parse-document.json @@ -1,5 +1,5 @@ { "dependencies": [], - "functions": ["parseDocument 1.0"], + "functions": ["parseDocument 1.1"], "includes": [] } diff --git a/functions/parse-document/1.1/function.json b/functions/parse-document/1.1/function.json new file mode 100644 index 0000000..9da90bf --- /dev/null +++ b/functions/parse-document/1.1/function.json @@ -0,0 +1,69 @@ +{ + "description": "Extract and convert textual information from various document formats, including docx, pptx, xlsx, odt, odp, ods, and pdf files.", + "label": "Parse Document to Text", + "category": "Misc", + "icon": { + "name": "FileIcon", + "color": "Orange" + }, + "options": [ + { + "meta": { + "type": "Value", + "allowedKinds": ["STRING", "FILE"], + "validations": { + "required": true + } + }, + "name": "document", + "label": "Document", + "info": "Specify the document 
to be parsed. This can be a file property or a URL." + }, + { + "meta": { + "type": "Number", + "default": 300, + "validations": { + "required": true + } + }, + "advanced": true, + "name": "density", + "label": "Density", + "info": "This option specifies the image resolution. The higher the density, the better the quality of the output will be. However, higher density also means slower processing." + }, + { + "meta": { + "type": "Boolean" + }, + "advanced": true, + "name": "forceImage", + "label": "Force image", + "info": "Force the document to be scanned as an image. Sometimes this can result in a better output." + }, + { + "meta": { + "type": "Boolean", + "default": true + }, + "advanced": true, + "name": "removeSpecialCharacters", + "label": "Remove Special Characters", + "info": "Remove special characters from the output." + }, + { + "name": "result", + "label": "Result", + "meta": { + "type": "Output", + "validations": { + "required": true + }, + "output": { + "type": "Text" + } + } + } + ], + "yields": "NONE" +} diff --git a/functions/parse-document/1.1/index.js b/functions/parse-document/1.1/index.js new file mode 100644 index 0000000..e47e59b --- /dev/null +++ b/functions/parse-document/1.1/index.js @@ -0,0 +1,24 @@ +const isFileProperty = (value) => + value && typeof value === 'object' && 'url' in value; + +const parseDocument = async ({ + document, + density, + forceImage, + removeSpecialCharacters, +}) => { + const url = isFileProperty(document) ? 
document?.url : document; + + const { result } = await documentParser({ + document: url, + parserOptions: { density, forceImage }, + }); + + if (removeSpecialCharacters && typeof result === 'string') { + return { result: result.replaceAll(' ', ' ') }; + } + + return { result }; +}; + +export default parseDocument; diff --git a/package.json b/package.json index de8bdd8..7feb807 100644 --- a/package.json +++ b/package.json @@ -5,8 +5,9 @@ "prepare": "husky install", "lint": "eslint", "prettier:base": "prettier --single-quote", - "prettier:check": "yarn prettier:base --list-different \"{functions,__tests__,isolate-tests}/**/*.js\"", - "prettier:write": "yarn prettier:base --write \"{functions,__tests__,isolate-tests}/**/*.js\"", + "prettier:check": "pnpm run prettier:base --list-different \"{functions,__tests__,isolate-tests}/**/*.js\"", + "prettier:write": "pnpm run prettier:base --write \"{functions,__tests__,isolate-tests}/**/*.js\"", + "fmt": "pnpm run --silent prettier:write", "test": "vitest run", "test:isolate": "bb functions test isolate-tests/", "publish": "bb blocks publish"