merge similar definitions (#100)

UAlbertaALTLab · Jan 30, 2022 · 9278ba4 · 9278ba4
1 parent c05372a
commit 9278ba4
Show file tree

Hide file tree

Showing 7 changed files with 583 additions and 63 deletions.
diff --git a/lib/aggregate/index.js b/lib/aggregate/index.js
@@ -1,11 +1,12 @@
-import createSpinner     from 'ora';
-import DatabaseIndex     from '../utilities/DatabaseIndex.js';
-import { fileURLToPath } from 'url';
-import parseAnalysis     from '../utilities/parseAnalysis.js';
-import parseCategory     from '../utilities/parseCategory.js';
-import readNDJSON        from '../utilities/readNDJSON.js';
-import { Transducer }    from 'hfstol';
-import writeNDJSON       from '../utilities/writeNDJSON.js';
+import aggregateDefinitions from '../utilities/aggregateDefinitions.js';
+import createSpinner        from 'ora';
+import DatabaseIndex        from '../utilities/DatabaseIndex.js';
+import { fileURLToPath }    from 'url';
+import parseAnalysis        from '../utilities/parseAnalysis.js';
+import parseCategory        from '../utilities/parseCategory.js';
+import readNDJSON           from '../utilities/readNDJSON.js';
+import { Transducer }       from 'hfstol';
+import writeNDJSON          from '../utilities/writeNDJSON.js';
 
 import {
   dirname as getDirname,
@@ -249,7 +250,7 @@ function aggregateEntry(entry) {
   }
 
   // NOTE: Currently not displaying MD senses for programmatic matches.
-  // TODO: Use a bag-of-words approach to decide which MD senses to display.
+  entry.senses = aggregateDefinitions(entry.senses);
 
   return entry;
 

diff --git a/lib/convert/dlx2importjson.js b/lib/convert/dlx2importjson.js
@@ -1,9 +1,10 @@
-import compare       from '../utilities/compare.js';
-import createSpinner from 'ora';
-import fs            from 'fs-extra';
-import parseCategory from '../utilities/parseCategory.js';
-import readNDJSON    from '../utilities/readNDJSON.js';
-import sortKeys      from 'sort-keys';
+import compare              from '../utilities/compare.js';
+import createSpinner        from 'ora';
+import fs                   from 'fs-extra';
+import parseCategory        from '../utilities/parseCategory.js';
+import readNDJSON           from '../utilities/readNDJSON.js';
+import removeParentheticals from '../utilities/removeParentheticals.js';
+import sortKeys             from 'sort-keys';
 
 const instrRegExp     = /^by\s+/u;
 const latinNoteRegExp = /[ei]\.\s*[eg]\./iu;
@@ -12,24 +13,6 @@ const latinNoteRegExp = /[ei]\.\s*[eg]\./iu;
 // This prevents matches on things like "house(s)".
 const parentheticalRegExp = /(?<parenthetical>\s+\(\s*(?<parenText>.+?)\s*\)\s*)/gu;
 
-const EnglishPronouns = new Set([
-  `his/her own`,
-  `him/herself`,
-  `it as`,
-  `it/him`,
-  `it`,
-  `of it`,
-  `of something`,
-  `on s.t.`,
-  `s.o. as`,
-  `s.t.`,
-  `something`,
-  `that`,
-  `them`,
-  `to it/him`,
-  `to something`,
-]);
-
 const EnglishAbbrevs = [
   `s\\.o\\.`,
   `s\\.t\\.`,
@@ -143,7 +126,7 @@ function convertEntry({
 
     const isPronoun         = pos === `PrA`;
     const displayDefinition = original;
-    const coreDefinition    = createCoreDefinition(definition);
+    const coreDefinition    = removeParentheticals(definition);
 
     const semanticDefinition = createSemanticDefinition(definition, {
       i,
@@ -191,35 +174,6 @@ function convertEntry({
 
 }
 
-function createCoreDefinition(definition) {
-
-  let coreDefinition = definition;
-  let match;
-
-  while ((match = parentheticalRegExp.exec(coreDefinition)) !== null) {
-
-    const { parenText, parenthetical } = match.groups;
-
-    // allow desired parentheticals
-    if (
-      instrRegExp.test(parenText) || // allow "by" phrases (instrumentals)
-      EnglishPronouns.has(parenText) // allow pronouns
-    ) {
-      continue;
-    }
-
-    // remove all other parentheticals (including "e.g." and "i.e." parentheticals)
-    coreDefinition = coreDefinition.replace(parenthetical, ` `);
-
-  }
-
-  return coreDefinition
-  .replace(/\s{2,}/gu, ` `)
-  .replace(/\s+,/gu, `,`)
-  .trim();
-
-}
-
 function createSemanticDefinition(definition, { isPronoun, literalMeaning, notes, scientificName }) {
 
   let semanticDefinition = definition;

diff --git a/lib/utilities/aggregateDefinitions.js b/lib/utilities/aggregateDefinitions.js
@@ -0,0 +1,109 @@
+/* eslint-disable
+  no-param-reassign,
+*/
+
+import normalizeDefinition from './normalizeDefinition.js';
+
+const defaultThreshold = 0.8;
+
+/**
+ * Compares two definitions and returns the minimum similarity value for the two definitions, between 0 and 1, 0 being completely dissimilar, and 1 being completely similar.
+ * Similarity is measured over the *unique* whitespace-separated words of each definition, so repeated words and leading/trailing whitespace do not lower the score (a definition is always 100% similar to itself).
+ * @param  {String} a The first definition.
+ * @param  {String} b The second definition.
+ * @return {Number}   The smaller of "share of A's unique words found in B" and "share of B's unique words found in A".
+ */
+function getSimilarity(a, b) {
+  // NOTE: Trim first so that leading / trailing whitespace doesn't produce an empty-string "word".
+  // An empty definition still yields a single empty word, so the divisions below never divide by zero.
+  const aWords    = [...new Set(a.trim().split(/\s+/u))]; // unique words in definition A
+  const bWords    = [...new Set(b.trim().split(/\s+/u))]; // unique words in definition B
+  const bothWords = intersection(aWords, bWords);         // unique words contained in both definitions
+  // NOTE: The intersection is deduplicated, so it must be divided by the *unique* word counts.
+  // Dividing by the raw word counts would make "to go to" only 67% similar to itself.
+  const aOverlap  = bothWords.length / aWords.length;     // % of unique words in A that are also in B
+  const bOverlap  = bothWords.length / bWords.length;     // % of unique words in B that are also in A
+  return Math.min(aOverlap, bOverlap);
+}
+
+/**
+ * Returns the distinct items of `a` that also occur in `b`, in order of first appearance in `a`.
+ * @param  {Iterable} a The items to keep when shared.
+ * @param  {Iterable} b The items to check membership against.
+ * @return {Array}
+ */
+function intersection(a, b) {
+  const lookup = new Set(b);
+  const shared = [];
+  for (const item of new Set(a)) {
+    if (lookup.has(item)) shared.push(item);
+  }
+  return shared;
+}
+
+/**
+ * Given a set of definitions, this function determines which definitions are sufficiently similar that one can be omitted.
+ * Definitions are visited source by source, in order of precedence. Every definition from the first source that has any definitions is kept. Each later definition is either merged into the first already-kept definition it is similar to (its sources are added to that definition's `sources`), or kept as a new definition.
+ * Each definition is handled exactly once, under the highest-precedence source it lists.
+ * @param {Array}  definitions             An Array of definitions to compare. Each definition must be Object with `sources` and `definition` properties.
+ * @param {Object} [options={}]            An options Object.
+ * @param {Array}  [options.precedence]    An Array of data sources in order of precedence. If omitted, sources are ordered by their first appearance in the `sources` of the given definitions.
+ * @param {Number} [options.threshold=0.8] The minimum threshold of matching words for two definitions to be considered the same, as a percentage between 0 and 1.
+ * @returns {Array} A new Array containing the kept definition Objects (the same Object references that were passed in). The `sources` Array of a kept definition is extended in place with the sources of any definitions merged into it.
+ * @throws {Error} If a definition lists a source that is not included in the precedence list.
+ */
+export default function aggregateDefinitions(definitions, options = {}) {
+
+  // NOTE: Don't alter the original definitions Array.
+  let { precedence } = options;
+
+  if (!precedence) {
+    precedence = Array.from(new Set(definitions.flatMap(({ sources }) => sources)));
+  }
+
+  // NOTE: Normalized text is kept in a side table rather than on the definition Objects,
+  // so no temporary property is left behind on definitions that get merged away, or if this function throws.
+  const normalized = new Map;
+
+  for (const def of definitions) {
+
+    for (const source of def.sources) {
+      if (!precedence.includes(source)) {
+        throw new Error(`Unrecognized source: ${ source } in ${ def.definition }`);
+      }
+    }
+
+    normalized.set(def, normalizeDefinition(def.definition));
+
+  }
+
+  const threshold         = options.threshold ?? defaultThreshold;
+  const mergedDefinitions = [];
+  const processed         = new Set; // definitions that have already been kept or merged
+
+  for (const src of precedence) {
+
+    // if there aren't yet any selected definitions, all definitions for this source are kept as-is
+    const isFirstSource = mergedDefinitions.length === 0;
+
+    for (const currentDefinition of definitions) {
+
+      // NOTE: A definition with multiple sources must only be handled once (under its highest-precedence source).
+      // Otherwise its sources would be merged repeatedly, or an already-kept definition could be merged into another one.
+      if (processed.has(currentDefinition)) continue;
+      if (!currentDefinition.sources.includes(src)) continue;
+
+      processed.add(currentDefinition);
+
+      if (isFirstSource) {
+        mergedDefinitions.push(currentDefinition);
+        continue;
+      }
+
+      // compare the definition to each selected definition, stopping at the first similar one
+      const currentText        = normalized.get(currentDefinition);
+      const existingDefinition = mergedDefinitions.find(
+        existing => getSimilarity(currentText, normalized.get(existing)) >= threshold,
+      );
+
+      // the definition hasn't been matched to an existing one, so keep it
+      if (!existingDefinition) {
+        mergedDefinitions.push(currentDefinition);
+        continue;
+      }
+
+      // merge the definition into the existing one, without listing any source twice
+      for (const source of currentDefinition.sources) {
+        if (!existingDefinition.sources.includes(source)) {
+          existingDefinition.sources.push(source);
+        }
+      }
+
+    }
+
+  }
+
+  return mergedDefinitions;
+
+}