feat: add replace special characters option to parse document

bettyblocks · Nov 21, 2024 · ea674a2 · ea674a2
1 parent 730e0e1
commit ea674a2
Show file tree

Hide file tree

Showing 6 changed files with 162 additions and 6 deletions.
diff --git a/__tests__/parse-document/1.1/index.test.js b/__tests__/parse-document/1.1/index.test.js
@@ -0,0 +1,54 @@
+import parseDocument from '../../../functions/parse-document/1.1';
+
+describe('Parse document', () => {
+  test('It returns a string based on a URL', async () => {
+    const { result } = await parseDocument({
+      document: 'https://www.example.com/document.pdf',
+      density: 300,
+      forceImage: false,
+    });
+
+    expect(result).toEqual('Dummy result');
+  });
+
+  test('It returns a string based on a file property', async () => {
+    const { result } = await parseDocument({
+      document: {
+        name: 'document.pdf',
+        url: 'https://www.example.com/document.pdf',
+      },
+      density: 300,
+      forceImage: false,
+    });
+
+    expect(result).toEqual('Dummy result');
+  });
+
+  test('It replaces special sequence of characters to spaces if enabled', async () => {
+    const { result } = await parseDocument({
+      document: {
+        name: 'special-document.pdf',
+        url: 'https://www.example.com/special-document.pdf',
+      },
+      density: 300,
+      forceImage: false,
+      removeSpecialCharacters: true,
+    });
+
+    expect(result).toEqual('Dummy result with special characters');
+  });
+
+  test("It doesn't replace special sequence of characters to spaces if disabled", async () => {
+    const { result } = await parseDocument({
+      document: {
+        name: 'special-document.pdf',
+        url: 'https://www.example.com/special-document.pdf',
+      },
+      density: 300,
+      forceImage: false,
+      removeSpecialCharacters: false,
+    });
+
+    expect(result).toEqual('Dummy&nbsp;result with special&nbsp;characters');
+  });
+});
diff --git a/__tests__/support/document-parser/index.js b/__tests__/support/document-parser/index.js
@@ -1,5 +1,13 @@
-const documentParser = async (_) => ({
-  result: 'Dummy result',
-});
+const documentParser = async ({ document }) => {
+  if (document === 'https://www.example.com/special-document.pdf') {
+    return {
+      result: 'Dummy&nbsp;result with special&nbsp;characters',
+    };
+  }
+
+  return {
+    result: 'Dummy result',
+  };
+};
 
 export default documentParser;
diff --git a/blocks/parse-document.json b/blocks/parse-document.json
@@ -1,5 +1,5 @@
 {
   "dependencies": [],
-  "functions": ["parseDocument 1.0"],
+  "functions": ["parseDocument 1.1"],
   "includes": []
 }
diff --git a/functions/parse-document/1.1/function.json b/functions/parse-document/1.1/function.json
@@ -0,0 +1,69 @@
+{
+  "description": "Extract and convert textual information from various document formats, including docx, pptx, xlsx, odt, odp, ods, and pdf files.",
+  "label": "Parse Document to Text",
+  "category": "Misc",
+  "icon": {
+    "name": "FileIcon",
+    "color": "Orange"
+  },
+  "options": [
+    {
+      "meta": {
+        "type": "Value",
+        "allowedKinds": ["STRING", "FILE"],
+        "validations": {
+          "required": true
+        }
+      },
+      "name": "document",
+      "label": "Document",
+      "info": "Specify the document to be parsed. This can be a file property or a URL."
+    },
+    {
+      "meta": {
+        "type": "Number",
+        "default": 300,
+        "validations": {
+          "required": true
+        }
+      },
+      "advanced": true,
+      "name": "density",
+      "label": "Density",
+      "info": "This option specifies the image resolution. The higher the density, the better the quality of the output will be. However, higher density also means slower processing."
+    },
+    {
+      "meta": {
+        "type": "Boolean"
+      },
+      "advanced": true,
+      "name": "forceImage",
+      "label": "Force image",
+      "info": "Force the document to be scanned as an image. Sometimes this can result in a better output."
+    },
+    {
+      "meta": {
+        "type": "Boolean",
+        "default": true
+      },
+      "advanced": true,
+      "name": "removeSpecialCharacters",
+      "label": "Remove Special Characters",
+      "info": "Replace HTML non-breaking space entities (&nbsp;) in the output with regular spaces."
+    },
+    {
+      "name": "result",
+      "label": "Result",
+      "meta": {
+        "type": "Output",
+        "validations": {
+          "required": true
+        },
+        "output": {
+          "type": "Text"
+        }
+      }
+    }
+  ],
+  "yields": "NONE"
+}
diff --git a/functions/parse-document/1.1/index.js b/functions/parse-document/1.1/index.js
@@ -0,0 +1,24 @@
+const isFileProperty = (value) =>
+  value && typeof value === 'object' && 'url' in value;
+
+const parseDocument = async ({
+  document,
+  density,
+  forceImage,
+  removeSpecialCharacters,
+}) => {
+  const url = isFileProperty(document) ? document?.url : document;
+
+  const { result } = await documentParser({
+    document: url,
+    parserOptions: { density, forceImage },
+  });
+
+  if (removeSpecialCharacters && typeof result === 'string') {
+    return { result: result.replaceAll('&nbsp;', ' ') };
+  }
+
+  return { result };
+};
+
+export default parseDocument;
diff --git a/package.json b/package.json
@@ -5,8 +5,9 @@
     "prepare": "husky install",
     "lint": "eslint",
     "prettier:base": "prettier --single-quote",
-    "prettier:check": "yarn prettier:base --list-different \"{functions,__tests__,isolate-tests}/**/*.js\"",
-    "prettier:write": "yarn prettier:base --write \"{functions,__tests__,isolate-tests}/**/*.js\"",
+    "prettier:check": "pnpm run prettier:base --list-different \"{functions,__tests__,isolate-tests}/**/*.js\"",
+    "prettier:write": "pnpm run prettier:base --write \"{functions,__tests__,isolate-tests}/**/*.js\"",
+    "fmt": "pnpm run --silent prettier:write",
     "test": "vitest run",
     "test:isolate": "bb functions test isolate-tests/",
     "publish": "bb blocks publish"