-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add replace special characters option to parse document
- Loading branch information
Thomas Timmer
committed
Nov 21, 2024
1 parent
730e0e1
commit ea674a2
Showing
6 changed files
with
162 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import parseDocument from '../../../functions/parse-document/1.1'; | ||
|
||
/**
 * Jest-style suite for the `parseDocument` function.
 *
 * Exercises both accepted shapes of the `document` option (a plain URL string
 * and an object carrying a `url` key) and the `removeSpecialCharacters`
 * toggle.
 */
describe('Parse document', () => {
  // Parser settings that are identical in every case below.
  const parserSettings = { density: 300, forceImage: false };

  // File-property style input pointing at the "special" fixture document.
  const specialDocument = {
    name: 'special-document.pdf',
    url: 'https://www.example.com/special-document.pdf',
  };

  test('It returns a string based on a URL', async () => {
    const { result } = await parseDocument({
      ...parserSettings,
      document: 'https://www.example.com/document.pdf',
    });

    expect(result).toEqual('Dummy result');
  });

  test('It returns a string based on a file property', async () => {
    const { result } = await parseDocument({
      ...parserSettings,
      document: {
        name: 'document.pdf',
        url: 'https://www.example.com/document.pdf',
      },
    });

    expect(result).toEqual('Dummy result');
  });

  // NOTE(review): as rendered here, the two toggle tests below expect the
  // exact same string, so they would pass even if the flag were ignored.
  // Presumably the fixture and/or the "disabled" expectation is meant to
  // contain the special character sequence - confirm against the stub.
  test('It replaces special sequence of characters to spaces if enabled', async () => {
    const { result } = await parseDocument({
      ...parserSettings,
      document: specialDocument,
      removeSpecialCharacters: true,
    });

    expect(result).toEqual('Dummy result with special characters');
  });

  test("It doesn't replace special sequence of characters to spaces if disabled", async () => {
    const { result } = await parseDocument({
      ...parserSettings,
      document: specialDocument,
      removeSpecialCharacters: false,
    });

    expect(result).toEqual('Dummy result with special characters');
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,13 @@ | ||
/**
 * Stub of `documentParser` that returns canned results instead of parsing
 * anything.
 *
 * @param {object} args
 * @param {*} args.document - Value identifying the document; only compared
 *   against the "special" fixture URL.
 * @returns {Promise<{ result: string }>} A fixed string: the "special
 *   characters" variant for the special fixture URL, the generic dummy
 *   result for everything else.
 */
const documentParser = async ({ document }) => {
  // The special fixture is matched by exact URL equality.
  if (document === 'https://www.example.com/special-document.pdf') {
    return {
      result: 'Dummy result with special characters',
    };
  }

  return {
    result: 'Dummy result',
  };
};

export default documentParser;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
{ | ||
"dependencies": [], | ||
"functions": ["parseDocument 1.0"], | ||
"functions": ["parseDocument 1.1"], | ||
"includes": [] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
{ | ||
"description": "Extract and convert textual information from various document formats, including docx, pptx, xlsx, odt, odp, ods, and pdf files.", | ||
"label": "Parse Document to Text", | ||
"category": "Misc", | ||
"icon": { | ||
"name": "FileIcon", | ||
"color": "Orange" | ||
}, | ||
"options": [ | ||
{ | ||
"meta": { | ||
"type": "Value", | ||
"allowedKinds": ["STRING", "FILE"], | ||
"validations": { | ||
"required": true | ||
} | ||
}, | ||
"name": "document", | ||
"label": "Document", | ||
"info": "Specify the document to be parsed. This can be a file property or a URL." | ||
}, | ||
{ | ||
"meta": { | ||
"type": "Number", | ||
"default": 300, | ||
"validations": { | ||
"required": true | ||
} | ||
}, | ||
"advanced": true, | ||
"name": "density", | ||
"label": "Density", | ||
"info": "This option specifies the image resolution. The higher the density, the better the quality of the output will be. However, higher density also means slower processing." | ||
}, | ||
{ | ||
"meta": { | ||
"type": "Boolean" | ||
}, | ||
"advanced": true, | ||
"name": "forceImage", | ||
"label": "Force image", | ||
"info": "Force the document to be scanned as an image. Sometimes this can result in a better output." | ||
}, | ||
{ | ||
"meta": { | ||
"type": "Boolean", | ||
"default": true | ||
}, | ||
"advanced": true, | ||
"name": "removeSpecialCharacters", | ||
"label": "Remove Special Characters", | ||
"info": "Replace special characters in the output with spaces." | ||
}, | ||
{ | ||
"name": "result", | ||
"label": "Result", | ||
"meta": { | ||
"type": "Output", | ||
"validations": { | ||
"required": true | ||
}, | ||
"output": { | ||
"type": "Text" | ||
} | ||
} | ||
} | ||
], | ||
"yields": "NONE" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
/**
 * Tells whether `value` looks like a file property: a non-null object that
 * exposes a `url` key (own or inherited).
 *
 * @param {*} value - Candidate taken from the `document` option.
 * @returns {boolean} `true` for objects with a `url` key, otherwise `false`.
 */
const isFileProperty = (value) =>
  value !== null && typeof value === 'object' && 'url' in value;

/**
 * Resolves the document location, hands it to `documentParser`, and returns
 * the parsed text.
 *
 * `documentParser` is neither defined nor imported in this module; it is
 * presumably supplied as a global by the runtime - confirm.
 *
 * @param {object} options
 * @param {string|object} options.document - Either a URL string or an object
 *   with a `url` key, whose `url` is used.
 * @param {number} options.density - Forwarded as `parserOptions.density`.
 * @param {boolean} options.forceImage - Forwarded as
 *   `parserOptions.forceImage`.
 * @param {boolean} [options.removeSpecialCharacters] - When truthy and the
 *   parser result is a string, the replacement below is applied.
 * @returns {Promise<{ result: * }>} The parser's `result`, post-processed
 *   when requested.
 */
const parseDocument = async ({
  document,
  density,
  forceImage,
  removeSpecialCharacters,
}) => {
  // The guard already proved `document` is a non-null object, so plain
  // property access is safe here.
  const url = isFileProperty(document) ? document.url : document;

  const { result } = await documentParser({
    document: url,
    parserOptions: { density, forceImage },
  });

  // Non-string results are passed through untouched.
  if (removeSpecialCharacters && typeof result === 'string') {
    // NOTE(review): as rendered here the search string is an ordinary space,
    // which would make this call a no-op. Presumably the real source holds a
    // non-printing character - confirm, and spell it as a `\u....` escape so
    // it survives editors and copy/paste.
    return { result: result.replaceAll(' ', ' ') };
  }

  return { result };
};

export default parseDocument;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters