Skip to content

Commit

Permalink
feat: add replace special characters option to parse document
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Timmer committed Nov 21, 2024
1 parent 730e0e1 commit ea674a2
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 6 deletions.
54 changes: 54 additions & 0 deletions __tests__/parse-document/1.1/index.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import parseDocument from '../../../functions/parse-document/1.1';

// Test suite for parseDocument 1.1.
// NOTE(review): `describe`, `test` and `expect` are not imported here, so they
// are presumably provided as test-runner globals, and the `documentParser`
// call inside parseDocument is presumably backed by the stub in
// __tests__/support/document-parser — confirm against the test setup.
describe('Parse document', () => {
  // A plain string is passed to the parser as the document URL.
  test('It returns a string based on a URL', async () => {
    const { result } = await parseDocument({
      document: 'https://www.example.com/document.pdf',
      density: 300,
      forceImage: false,
    });

    expect(result).toEqual('Dummy result');
  });

  // A file property (object with a `url` key) is resolved to its URL first.
  test('It returns a string based on a file property', async () => {
    const { result } = await parseDocument({
      document: {
        name: 'document.pdf',
        url: 'https://www.example.com/document.pdf',
      },
      density: 300,
      forceImage: false,
    });

    expect(result).toEqual('Dummy result');
  });

  // With the option enabled, the parsed text goes through the replacement step.
  test('It replaces a special sequence of characters with spaces if enabled', async () => {
    const { result } = await parseDocument({
      document: {
        name: 'special-document.pdf',
        url: 'https://www.example.com/special-document.pdf',
      },
      density: 300,
      forceImage: false,
      removeSpecialCharacters: true,
    });

    expect(result).toEqual('Dummy result with special characters');
  });

  // With the option disabled, the parser output is returned untouched.
  test("It doesn't replace a special sequence of characters with spaces if disabled", async () => {
    const { result } = await parseDocument({
      document: {
        name: 'special-document.pdf',
        url: 'https://www.example.com/special-document.pdf',
      },
      density: 300,
      forceImage: false,
      removeSpecialCharacters: false,
    });

    expect(result).toEqual('Dummy result with special characters');
  });
});
14 changes: 11 additions & 3 deletions __tests__/support/document-parser/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
/**
 * Test stub for the document parser: ignores its argument and always
 * resolves to the same fixed result object.
 *
 * @param {*} _ - Ignored.
 * @returns {Promise<{ result: string }>} The fixed dummy result.
 */
const documentParser = async (_) => {
  const response = { result: 'Dummy result' };
  return response;
};
/**
 * Test stub for the document parser. Resolves to a dedicated dummy text for
 * the "special" document URL and to the generic dummy text for anything else.
 *
 * @param {{ document: * }} options - Only `document` is read; it is compared
 *   strictly against the special document URL.
 * @returns {Promise<{ result: string }>} Object holding the dummy text.
 */
const documentParser = async ({ document }) => {
  const isSpecialDocument =
    document === 'https://www.example.com/special-document.pdf';

  const result = isSpecialDocument
    ? 'Dummy result with special characters'
    : 'Dummy result';

  return { result };
};

export default documentParser;
2 changes: 1 addition & 1 deletion blocks/parse-document.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"dependencies": [],
"functions": ["parseDocument 1.0"],
"functions": ["parseDocument 1.1"],
"includes": []
}
69 changes: 69 additions & 0 deletions functions/parse-document/1.1/function.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
{
"description": "Extract and convert textual information from various document formats, including docx, pptx, xlsx, odt, odp, ods, and pdf files.",
"label": "Parse Document to Text",
"category": "Misc",
"icon": {
"name": "FileIcon",
"color": "Orange"
},
"options": [
{
"meta": {
"type": "Value",
"allowedKinds": ["STRING", "FILE"],
"validations": {
"required": true
}
},
"name": "document",
"label": "Document",
"info": "Specify the document to be parsed. This can be a file property or a URL."
},
{
"meta": {
"type": "Number",
"default": 300,
"validations": {
"required": true
}
},
"advanced": true,
"name": "density",
"label": "Density",
"info": "This option specifies the image resolution. The higher the density, the better the quality of the output will be. However, higher density also means slower processing."
},
{
"meta": {
"type": "Boolean"
},
"advanced": true,
"name": "forceImage",
"label": "Force image",
"info": "Force the document to be scanned as an image. Sometimes this can result in a better output."
},
{
"meta": {
"type": "Boolean",
"default": true
},
"advanced": true,
"name": "removeSpecialCharacters",
"label": "Remove Special Characters",
"info": "Replace special characters in the output with regular spaces."
},
{
"name": "result",
"label": "Result",
"meta": {
"type": "Output",
"validations": {
"required": true
},
"output": {
"type": "Text"
}
}
}
],
"yields": "NONE"
}
24 changes: 24 additions & 0 deletions functions/parse-document/1.1/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/**
 * Checks whether a value looks like a file property, i.e. an object that
 * exposes a `url` key (own or inherited, as tested by the `in` operator).
 *
 * @param {*} value - Candidate value.
 * @returns {*} The value itself when it is falsy, `false` for truthy
 *   non-objects, otherwise whether `'url'` is present in the object.
 */
const isFileProperty = (value) => {
  // Falsy inputs are passed through unchanged rather than coerced to `false`.
  if (!value) {
    return value;
  }

  if (typeof value !== 'object') {
    return false;
  }

  return 'url' in value;
};

/**
 * Parses a document to text and optionally replaces special characters in
 * the parsed text with regular spaces.
 *
 * `documentParser` is not defined in the visible code; it is presumably a
 * global supplied by the runtime — TODO confirm.
 *
 * @param {object} options
 * @param {string|object} options.document - Either a URL or a file property
 *   (an object with a `url` key), in which case its `url` is used.
 * @param {number} options.density - Forwarded to the parser as `parserOptions.density`.
 * @param {boolean} options.forceImage - Forwarded to the parser as `parserOptions.forceImage`.
 * @param {boolean} options.removeSpecialCharacters - When truthy and the
 *   parser result is a string, special characters are replaced by spaces.
 * @returns {Promise<{ result: * }>} The (optionally cleaned) parser result.
 */
const parseDocument = async ({
  document,
  density,
  forceImage,
  removeSpecialCharacters,
}) => {
  // isFileProperty only passes truthy objects, so no optional chaining is needed.
  const url = isFileProperty(document) ? document.url : document;

  const { result } = await documentParser({
    document: url,
    parserOptions: { density, forceImage },
  });

  if (removeSpecialCharacters && typeof result === 'string') {
    // The search string is written as an escape so the character being replaced
    // is visible in source; a literal space-for-space swap would be a no-op.
    // NOTE(review): assumes the intended special character is the non-breaking
    // space (U+00A0) — confirm against real parser output.
    return { result: result.replaceAll('\u00A0', ' ') };
  }

  return { result };
};

export default parseDocument;
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
"prepare": "husky install",
"lint": "eslint",
"prettier:base": "prettier --single-quote",
"prettier:check": "yarn prettier:base --list-different \"{functions,__tests__,isolate-tests}/**/*.js\"",
"prettier:write": "yarn prettier:base --write \"{functions,__tests__,isolate-tests}/**/*.js\"",
"prettier:check": "pnpm run prettier:base --list-different \"{functions,__tests__,isolate-tests}/**/*.js\"",
"prettier:write": "pnpm run prettier:base --write \"{functions,__tests__,isolate-tests}/**/*.js\"",
"fmt": "pnpm run --silent prettier:write",
"test": "vitest run",
"test:isolate": "bb functions test isolate-tests/",
"publish": "bb blocks publish"
Expand Down

0 comments on commit ea674a2

Please sign in to comment.