Skip to content

Commit

Permalink
merge similar definitions (#100)
Browse files Browse the repository at this point in the history
  • Loading branch information
dwhieb authored Jan 30, 2022
1 parent c05372a commit 9278ba4
Show file tree
Hide file tree
Showing 7 changed files with 583 additions and 63 deletions.
19 changes: 10 additions & 9 deletions lib/aggregate/index.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import createSpinner from 'ora';
import DatabaseIndex from '../utilities/DatabaseIndex.js';
import { fileURLToPath } from 'url';
import parseAnalysis from '../utilities/parseAnalysis.js';
import parseCategory from '../utilities/parseCategory.js';
import readNDJSON from '../utilities/readNDJSON.js';
import { Transducer } from 'hfstol';
import writeNDJSON from '../utilities/writeNDJSON.js';
import aggregateDefinitions from '../utilities/aggregateDefinitions.js';
import createSpinner from 'ora';
import DatabaseIndex from '../utilities/DatabaseIndex.js';
import { fileURLToPath } from 'url';
import parseAnalysis from '../utilities/parseAnalysis.js';
import parseCategory from '../utilities/parseCategory.js';
import readNDJSON from '../utilities/readNDJSON.js';
import { Transducer } from 'hfstol';
import writeNDJSON from '../utilities/writeNDJSON.js';

import {
dirname as getDirname,
Expand Down Expand Up @@ -249,7 +250,7 @@ function aggregateEntry(entry) {
}

// NOTE: Currently not displaying MD senses for programmatic matches.
// TODO: Use a bag-of-words approach to decide which MD senses to display.
entry.senses = aggregateDefinitions(entry.senses);

return entry;

Expand Down
62 changes: 8 additions & 54 deletions lib/convert/dlx2importjson.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import compare from '../utilities/compare.js';
import createSpinner from 'ora';
import fs from 'fs-extra';
import parseCategory from '../utilities/parseCategory.js';
import readNDJSON from '../utilities/readNDJSON.js';
import sortKeys from 'sort-keys';
import compare from '../utilities/compare.js';
import createSpinner from 'ora';
import fs from 'fs-extra';
import parseCategory from '../utilities/parseCategory.js';
import readNDJSON from '../utilities/readNDJSON.js';
import removeParentheticals from '../utilities/removeParentheticals.js';
import sortKeys from 'sort-keys';

// Matches text that begins with "by" followed by whitespace.
// Used to recognize "by" phrases (instrumentals) inside parentheticals.
const instrRegExp = /^by\s+/u;
// Matches two-letter dotted abbreviations built from e/i + e/g (e.g. "e.g.", "i.e."),
// case-insensitively, with optional whitespace between the two parts.
const latinNoteRegExp = /[ei]\.\s*[eg]\./iu;
Expand All @@ -12,24 +13,6 @@ const latinNoteRegExp = /[ei]\.\s*[eg]\./iu;
// This prevents matches on things like "house(s)".
// Named groups: `parenthetical` is the whole matched span (including the surrounding whitespace),
// and `parenText` is the text between the parentheses with its outer whitespace excluded.
// NOTE: The regex is global (`g`), so `exec()` / `test()` are stateful via `lastIndex`.
const parentheticalRegExp = /(?<parenthetical>\s+\(\s*(?<parenText>.+?)\s*\)\s*)/gu;

// Parenthetical texts that are treated as pronouns.
// A parenthetical whose text exactly matches one of these entries is kept in the core definition
// rather than removed (see `createCoreDefinition`).
const EnglishPronouns = new Set([
`his/her own`,
`him/herself`,
`it as`,
`it/him`,
`it`,
`of it`,
`of something`,
`on s.t.`,
`s.o. as`,
`s.t.`,
`something`,
`that`,
`them`,
`to it/him`,
`to something`,
]);

const EnglishAbbrevs = [
`s\\.o\\.`,
`s\\.t\\.`,
Expand Down Expand Up @@ -143,7 +126,7 @@ function convertEntry({

const isPronoun = pos === `PrA`;
const displayDefinition = original;
const coreDefinition = createCoreDefinition(definition);
const coreDefinition = removeParentheticals(definition);

const semanticDefinition = createSemanticDefinition(definition, {
i,
Expand Down Expand Up @@ -191,35 +174,6 @@ function convertEntry({

}

/**
 * Creates the core definition by removing parentheticals from a definition,
 * except for "by" phrases (instrumentals) and pronoun parentheticals, which are kept.
 * Whitespace is then collapsed, whitespace before commas is removed, and the result is trimmed.
 * @param  {String} definition The definition to strip parentheticals from.
 * @return {String}
 */
function createCoreDefinition(definition) {

  let coreDefinition = definition;
  let match;

  // The regex is global and shared at module level, so always start scanning from the beginning.
  parentheticalRegExp.lastIndex = 0;

  while ((match = parentheticalRegExp.exec(coreDefinition)) !== null) {

    const { parenText, parenthetical } = match.groups;

    const isAllowed = instrRegExp.test(parenText) // allow "by" phrases (instrumentals)
      || EnglishPronouns.has(parenText);          // allow pronouns

    if (isAllowed) {
      // The match also consumed the whitespace after ")". Resume right after the ")" so that
      // whitespace can serve as the leading whitespace of an adjacent parenthetical.
      parentheticalRegExp.lastIndex = match.index + parenthetical.trimEnd().length;
      continue;
    }

    // remove all other parentheticals (including "e.g." and "i.e." parentheticals)
    const before = coreDefinition.slice(0, match.index);
    const after  = coreDefinition.slice(match.index + parenthetical.length);

    coreDefinition = `${ before } ${ after }`;

    // The string just got shorter, so the regex's own `lastIndex` is stale.
    // Resume at the inserted space so that an adjacent parenthetical is still found.
    parentheticalRegExp.lastIndex = match.index;

  }

  return coreDefinition
    .replace(/\s{2,}/gu, ` `)
    .replace(/\s+,/gu, `,`)
    .trim();

}

function createSemanticDefinition(definition, { isPronoun, literalMeaning, notes, scientificName }) {

let semanticDefinition = definition;
Expand Down
109 changes: 109 additions & 0 deletions lib/utilities/aggregateDefinitions.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/* eslint-disable
no-param-reassign,
*/

import normalizeDefinition from './normalizeDefinition.js';

// Default minimum similarity (between 0 and 1) for two definitions to be considered the same.
// Used by `aggregateDefinitions` when `options.threshold` is not provided.
const defaultThreshold = 0.8;

/**
 * Compares two definitions and returns the minimum similarity value for the two definitions,
 * between 0 and 1, 0 being completely dissimilar, and 1 being completely similar.
 * Each definition is treated as a set of whitespace-separated words:
 * repeated words count once, and leading / trailing whitespace is ignored.
 * Two empty definitions yield 1; an empty and a non-empty definition yield 0.
 * @param  {String} a The first definition.
 * @param  {String} b The second definition.
 * @return {Number}
 */
function getSimilarity(a, b) {

  const aWords = new Set(a.split(/\s+/u).filter(Boolean)); // distinct words in definition A
  const bWords = new Set(b.split(/\s+/u).filter(Boolean)); // distinct words in definition B

  // Avoid dividing by zero when a definition contains no words.
  if (aWords.size === 0 || bWords.size === 0) {
    return aWords.size === bWords.size ? 1 : 0;
  }

  const bothWords = intersection([...aWords], [...bWords]); // distinct words contained in both definitions
  const aOverlap  = bothWords.length / aWords.size;         // % of distinct words in A that are also in B
  const bOverlap  = bothWords.length / bWords.size;         // % of distinct words in B that are also in A

  return Math.min(aOverlap, bOverlap);

}

/**
 * Lists the distinct items of `a` that also occur in `b`,
 * in the order in which they first appear in `a`.
 * @param  {Array} a The items to filter.
 * @param  {Array} b The items to check membership against.
 * @return {Array}
 */
function intersection(a, b) {

  const lookup = new Set(b);
  const shared = [];

  for (const item of new Set(a)) {
    if (lookup.has(item)) shared.push(item);
  }

  return shared;

}

/**
 * Given a set of definitions, this function determines which definitions are sufficiently similar that one can be omitted.
 * Sources are processed in order of precedence. A definition that is similar to an already-selected definition
 * is dropped, and its sources are added (without duplicates) to the `sources` of the definition it matched.
 * Each definition is handled once, even when it lists several sources.
 * NOTE: The `sources` Arrays of the selected definitions are modified in place.
 * @param  {Array}  definitions         An Array of definitions to compare. Each definition must be Object with `sources` and `definition` properties.
 * @param  {Object} [options={}]        An options Object.
 * @param  {Array}  [options.precedence] An Array of data sources in order of precedence. If omitted, sources are ordered by their first appearance in the definitions.
 * @param  {Number} [options.threshold=0.8] The minimum threshold of matching words for two definitions to be considered the same, as a percentage between 0 and 1.
 * @returns {Array} A new Array containing the selected definition Objects (the same Objects that were passed in).
 * @throws {Error} If a definition lists a source that is not in the precedence list.
 */
export default function aggregateDefinitions(definitions, options = {}) {

  // NOTE: Don't alter the original definitions Array.
  let { precedence } = options;

  if (!precedence) {
    precedence = Array.from(new Set(definitions.flatMap(({ sources }) => sources)));
  }

  const threshold = options.threshold ?? defaultThreshold;

  // Normalized texts are kept in a side table so that no temporary property
  // is ever left behind on the caller's definition Objects.
  const normalized = new Map;

  for (const def of definitions) {

    for (const source of def.sources) {
      if (!precedence.includes(source)) {
        throw new Error(`Unrecognized source: ${ source } in ${ def.definition }`);
      }
    }

    normalized.set(def, normalizeDefinition(def.definition));

  }

  const mergedDefinitions = [];
  const handled           = new Set; // definitions that were already selected or merged into another

  for (const src of precedence) {

    // get the not-yet-handled definitions for the current source
    const defs = definitions.filter(def => def.sources.includes(src) && !handled.has(def));

    // if there aren't yet any selected definitions, add the current ones
    if (!mergedDefinitions.length) {
      mergedDefinitions.push(...defs);
      for (const def of defs) handled.add(def);
      continue;
    }

    for (const currentDefinition of defs) {

      // guards against the same Object appearing more than once in the input
      if (handled.has(currentDefinition)) continue;
      handled.add(currentDefinition);

      // find the first selected definition that is similar enough to the current one
      const currentText        = normalized.get(currentDefinition);
      const existingDefinition = mergedDefinitions.find(
        selected => getSimilarity(currentText, normalized.get(selected)) >= threshold,
      );

      if (!existingDefinition) {
        mergedDefinitions.push(currentDefinition);
        continue;
      }

      // merge: record the current definition's sources on the existing definition, without duplicates
      for (const source of currentDefinition.sources) {
        if (!existingDefinition.sources.includes(source)) {
          existingDefinition.sources.push(source);
        }
      }

    }

  }

  return mergedDefinitions;

}
Loading

0 comments on commit 9278ba4

Please sign in to comment.