Skip to content

Commit

Permalink
Merge pull request #645 from alpheios-project/i644-language-detect-ad…
Browse files Browse the repository at this point in the history
…apter

Add adapter for language detection
  • Loading branch information
irina060981 authored Apr 15, 2021
2 parents 01ce963 + 7dcd853 commit 1327d40
Show file tree
Hide file tree
Showing 27 changed files with 1,109 additions and 200 deletions.
175 changes: 165 additions & 10 deletions packages/client-adapters/dist/alpheios-client-adapters.js

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

175 changes: 165 additions & 10 deletions packages/client-adapters/dist/alpheios-client-adapters.node.js

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion packages/client-adapters/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion packages/client-adapters/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"test": "jest --coverage && shx cat ./coverage/lcov.info | coveralls",
"test-no-coverage": "jest",
"test-c": "jest tests/client-adapters.test.js",
"test-u": "jest tests/adapters/dtsapi/adapter.test.js",
"test-u": "jest tests/adapters/detectlang/adapter.test.js --coverage",
"test-l": "jest tests/adapters/logeion/adapter.test.js",
"test-a": "jest tests/adapters/arethusa/adapter.test.js",
"build": "npm run build-lib && npm run build-node",
Expand Down
10 changes: 10 additions & 0 deletions packages/client-adapters/src/adapters/adapters-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,15 @@
"getDocument": ["baseUrl", "id"]
}
}
},

"detectlangGroup": {
"detectlang": {
"adapter": "detectLangMethod",
"methods": [ "getDetectedLangsList" ],
"params": {
"getDetectedLangsList": ["text"]
}
}
}
}
83 changes: 83 additions & 0 deletions packages/client-adapters/src/adapters/detectlang/adapter.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import BaseAdapter from '@clAdapters/adapters/base-adapter'
import DefaultConfig from '@clAdapters/adapters/detectlang/config.json'
import LangsList from '@clAdapters/adapters/detectlang/langs-list.json'

export default class DetectLangAdapter extends BaseAdapter {
  /**
   * Creates an adapter for the remote language detection service.
   *
   * @param {Object} config - properties for the adapter; they are handed to uploadConfig
   *        together with the defaults from detectlang/config.json
   * @param {Object} [config.sourceData] - a ready-made service response; when it is set,
   *        getDetectedLangsList uses it instead of fetching from the service
   */
  constructor (config = {}) {
    super()
    this.config = this.uploadConfig(config, DefaultConfig)
    this.sourceData = config.sourceData
  }

  /**
   * Detects the language of the given text.
   * Despite the plural in its name, the method resolves to a single language code
   * (the result of chooseOneLanguage), not to a list.
   *
   * @param {String} text - text for analysis
   * @returns {Promise<String|null|undefined>} - the chosen language code,
   *          null when the response contains no reliable detection,
   *          undefined when an error was registered through addError
   *          (no URL could be built, or something threw during the request)
   */
  async getDetectedLangsList (text) {
    try {
      // NOTE(review): the bearer token is taken from config.api, and the bundled default
      // config.json carries a literal key - consider supplying it from the client instead
      const requestParams = {
        method: 'POST',
        headers: { Authorization: `Bearer ${this.config.api}` }
      }

      // The URL is required even when sourceData is supplied and no request is sent
      const url = this.getUrl(text)
      if (!url) {
        this.addError(this.l10n.getMsg('DETECT_LANG_URL_ERROR'))
        return
      }

      const langsData = this.sourceData
        ? this.sourceData
        : await this.fetch(url, { requestParams })
      return this.chooseOneLanguage(langsData)
    } catch (error) {
      this.addError(this.l10n.getMsg('DETECT_LANG_FETCH_ERROR', { message: error.message }))
    }
  }

  /**
   * Builds the request URL: config.baseurl with the URI-encoded text in the q parameter.
   *
   * @param {String} text - text for analysis
   * @returns {String|null} - constructed URL, or null when text is empty/absent
   */
  getUrl (text) {
    if (!text) {
      return null
    }
    return `${this.config.baseurl}?q=${encodeURIComponent(text)}`
  }

  /**
   * Picks a single language from the service response.
   * The expected response format is
   *   { data: {
   *       detections: [
   *         { language: 'en', isReliable: true, confidence: 3.36 },
   *         { language: 'pt', isReliable: false, confidence: 3.36 },
   *         { language: 'eu', isReliable: false, confidence: 3.36 }
   *       ]
   *   }}
   * Only detections flagged isReliable are considered; the one with the highest confidence wins.
   * The winning code is translated through langs-list.json; a code that is missing there
   * is returned unchanged, as the service reported it.
   *
   * @param {Object} langsData - response of the detection service
   * @returns {String|null} - language code, or null when there is no reliable detection
   */
  chooseOneLanguage (langsData) {
    const detections = langsData && langsData.data && langsData.data.detections
    if (!detections || !(detections.length > 0)) {
      return null
    }

    const reliable = detections.filter(item => item.isReliable)
    if (reliable.length === 0) {
      return null
    }

    // Ascending sort, then the last element: the highest confidence
    // (sorting in place is safe here - filter has produced a fresh array)
    const ascending = reliable.sort((a, b) => a.confidence - b.confidence)
    const code = ascending[ascending.length - 1].language

    const known = LangsList[code]
    return known ? known.langCode : code
  }
}
4 changes: 4 additions & 0 deletions packages/client-adapters/src/adapters/detectlang/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"baseurl": "https://ws.detectlanguage.com/0.2/detect",
"api": "30e63bcc426af7b7c650aab568ed9ad7"
}
166 changes: 166 additions & 0 deletions packages/client-adapters/src/adapters/detectlang/langs-list.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
{
"aa": { "label": "Afar", "langCode": "aar" },
"ab": { "label": "Abkhazian", "langCode": "abk" },
"af": { "label": "Afrikaans", "langCode": "afr" },
"ak": { "label": "Akan", "langCode": "aka" },
"am": { "label": "Amharic", "langCode": "amh" },
"ar": { "label": "Arabic", "langCode": "ara" },
"as": { "label": "Assamese", "langCode": "asm" },
"ay": { "label": "Aymara", "langCode": "aym" },
"az": { "label": "Azerbaijani", "langCode": "aze" },
"ba": { "label": "Bashkir", "langCode": "bak" },
"be": { "label": "Belarusian", "langCode": "bel" },
"bg": { "label": "Bulgarian", "langCode": "bul" },
"bh": { "label": "Bihari", "langCode": "bih" },
"bi": { "label": "Bislama", "langCode": "bis" },
"bn": { "label": "Bengali", "langCode": "ben" },
"bo": { "label": "Tibetan", "langCode": "bod" },
"br": { "label": "Breton", "langCode": "bre" },
"bs": { "label": "Bosnian", "langCode": "bos" },
"bug": { "label": "Buginese", "langCode": "bug" },
"ca": { "label": "Catalan", "langCode": "cat" },
"ceb": { "label": "Cebuano", "langCode": "ceb" },
"chr": { "label": "Cherokee", "langCode": "chr" },
"co": { "label": "Corsican", "langCode": "cos" },
"crs": { "label": "Seselwa", "langCode": "crs" },
"cs": { "label": "Czech", "langCode": "ces" },
"cy": { "label": "Welsh", "langCode": "cym" },
"da": { "label": "Danish", "langCode": "dan" },
"de": { "label": "German", "langCode": "deu" },
"dv": { "label": "Dhivehi", "langCode": "div" },
"dz": { "label": "Dzongkha", "langCode": "dzo" },
"egy": { "label": "Egyptian", "langCode": "egy" },
"el": { "label": "Greek", "langCode": "grc" },
"en": { "label": "English", "langCode": "eng" },
"eo": { "label": "Esperanto", "langCode": "epo" },
"es": { "label": "Spanish", "langCode": "spa" },
"et": { "label": "Estonian", "langCode": "est" },
"eu": { "label": "Basque", "langCode": "eus" },
"fa": { "label": "Persian", "langCode": "per" },
"fi": { "label": "Finnish", "langCode": "fin" },
"fj": { "label": "Fijian", "langCode": "fij" },
"fo": { "label": "Faroese", "langCode": "fao" },
"fr": { "label": "French", "langCode": "fra" },
"fy": { "label": "Frisian", "langCode": "frr" },
"ga": { "label": "Irish", "langCode": "gle" },
"gd": { "label": "Scots Gaelic", "langCode": "gla" },
"gl": { "label": "Galician", "langCode": "glg" },
"gn": { "label": "Guarani", "langCode": "grn" },
"got": { "label": "Gothic", "langCode": "got" },
"gu": { "label": "Gujarati", "langCode": "guj" },
"gv": { "label": "Manx", "langCode": "glv" },
"ha": { "label": "Hausa", "langCode": "hau" },
"haw": { "label": "Hawaiian", "langCode": "haw" },
"hi": { "label": "Hindi", "langCode": "hin" },
"hmn": { "label": "Hmong", "langCode": "hmn" },
"hr": { "label": "Croatian", "langCode": "hrv" },
"ht": { "label": "Haitian Creole", "langCode": "hat" },
"hu": { "label": "Hungarian", "langCode": "hun" },
"hy": { "label": "Armenian", "langCode": "hye" },
"ia": { "label": "Interlingua", "langCode": "ina" },
"id": { "label": "Indonesian", "langCode": "ind" },
"ie": { "label": "Interlingue", "langCode": "ile" },
"ig": { "label": "Igbo", "langCode": "ibo" },
"ik": { "label": "Inupiaq", "langCode": "ipk" },
"is": { "label": "Icelandic", "langCode": "isl" },
"it": { "label": "Italian", "langCode": "ita" },
"iu": { "label": "Inuktitut", "langCode": "iku" },
"iw": { "label": "Hebrew", "langCode": "heb" },
"ja": { "label": "Japanese", "langCode": "jpn" },
"jw": { "label": "Javanese", "langCode": "jav" },
"ka": { "label": "Georgian", "langCode": "kat" },
"kha": { "label": "Khasi", "langCode": "kha" },
"kk": { "label": "Kazakh", "langCode": "kaz" },
"kl": { "label": "Greenlandic", "langCode": "kal" },
"km": { "label": "Khmer", "langCode": "khm" },
"kn": { "label": "Kannada", "langCode": "kan" },
"ko": { "label": "Korean", "langCode": "kor" },
"ks": { "label": "Kashmiri", "langCode": "kas" },
"ku": { "label": "Kurdish", "langCode": "kur" },
"ky": { "label": "Kyrgyz", "langCode": "kir" },
"la": { "label": "Latin", "langCode": "lat" },
"lb": { "label": "Luxembourgish", "langCode": "ltz" },
"lg": { "label": "Ganda", "langCode": "lug" },
"li": { "label": "Limbu", "langCode": "lim" },
"ln": { "label": "Lingala", "langCode": "lin" },
"lo": { "label": "Laothian", "langCode": "lao" },
"lt": { "label": "Lithuanian", "langCode": "lit" },
"lv": { "label": "Latvian", "langCode": "lav" },
"mfe": { "label": "Mauritian Creole", "langCode": "mfe" },
"mg": { "label": "Malagasy", "langCode": "mlg" },
"mi": { "label": "Maori", "langCode": "mao" },
"mk": { "label": "Macedonian", "langCode": "mac" },
"ml": { "label": "Malayalam", "langCode": "mal" },
"mn": { "label": "Mongolian", "langCode": "mon" },
"mr": { "label": "Marathi", "langCode": "mar" },
"ms": { "label": "Malay", "langCode": "msa" },
"mt": { "label": "Maltese", "langCode": "mlt" },
"my": { "label": "Burmese", "langCode": "mya" },
"na": { "label": "Nauru", "langCode": "nau" },
"ne": { "label": "Nepali", "langCode": "nep" },
"nl": { "label": "Dutch", "langCode": "nld" },
"no": { "label": "Norwegian", "langCode": "nor" },
"nr": { "label": "Ndebele", "langCode": "nbl" },
"nso": { "label": "Pedi", "langCode": "nso" },
"ny": { "label": "Nyanja", "langCode": "nya" },
"oc": { "label": "Occitan", "langCode": "oci" },
"om": { "label": "Oromo", "langCode": "orm" },
"or": { "label": "Oriya", "langCode": "ori" },
"pa": { "label": "Punjabi", "langCode": "pan" },
"pl": { "label": "Polish", "langCode": "pol" },
"ps": { "label": "Pashto", "langCode": "pus" },
"pt": { "label": "Portuguese", "langCode": "por" },
"qu": { "label": "Quechua", "langCode": "que" },
"rm": { "label": "Rhaeto Romance", "langCode": "roh" },
"rn": { "label": "Rundi", "langCode": "run" },
"ro": { "label": "Romanian", "langCode": "ron" },
"ru": { "label": "Russian", "langCode": "rus" },
"rw": { "label": "Kinyarwanda", "langCode": "kin" },
"sa": { "label": "Sanskrit", "langCode": "san" },
"sco": { "label": "Scots", "langCode": "sco" },
"sd": { "label": "Sindhi", "langCode": "snd" },
"sg": { "label": "Sango", "langCode": "sag" },
"si": { "label": "Sinhalese", "langCode": "sin" },
"sk": { "label": "Slovak", "langCode": "slk" },
"sl": { "label": "Slovenian", "langCode": "slv" },
"sm": { "label": "Samoan", "langCode": "smo" },
"sn": { "label": "Shona", "langCode": "sna" },
"so": { "label": "Somali", "langCode": "som" },
"sq": { "label": "Albanian", "langCode": "sqi" },
"sr": { "label": "Serbian", "langCode": "srp" },
"ss": { "label": "Siswant", "langCode": "ssw" },
"st": { "label": "Sesotho", "langCode": "sot" },
"su": { "label": "Sundanese", "langCode": "sun" },
"sv": { "label": "Swedish", "langCode": "swe" },
"sw": { "label": "Swahili", "langCode": "swa" },
"syr": { "label": "Syriac", "langCode": "syr" },
"ta": { "label": "Tamil", "langCode": "tam" },
"te": { "label": "Telugu", "langCode": "tel" },
"tg": { "label": "Tajik", "langCode": "tgk" },
"th": { "label": "Thai", "langCode": "tha" },
"ti": { "label": "Tigrinya", "langCode": "tir" },
"tk": { "label": "Turkmen", "langCode": "tuk" },
"tl": { "label": "Tagalog", "langCode": "tgl" },
"tlh": { "label": "Klingon", "langCode": "tlh" },
"tn": { "label": "Tswana", "langCode": "tsn" },
"to": { "label": "Tonga", "langCode": "tog" },
"tr": { "label": "Turkish", "langCode": "tur" },
"ts": { "label": "Tsonga", "langCode": "tso" },
"tt": { "label": "Tatar", "langCode": "tat" },
"ug": { "label": "Uighur", "langCode": "uig" },
"uk": { "label": "Ukrainian", "langCode": "ukr" },
"ur": { "label": "Urdu", "langCode": "urd" },
"uz": { "label": "Uzbek", "langCode": "uzb" },
"ve": { "label": "Venda", "langCode": "ven" },
"vi": { "label": "Vietnamese", "langCode": "vie" },
"vo": { "label": "Volapuk", "langCode": "vol" },
"war": { "label": "Waray Philippines", "langCode": "war" },
"wo": { "label": "Wolof", "langCode": "wol" },
"xh": { "label": "Xhosa", "langCode": "xho" },
"yi": { "label": "Yiddish", "langCode": "yid" },
"yo": { "label": "Yoruba", "langCode": "yor" },
"za": { "label": "Zhuang", "langCode": "zha" },
"zh": { "label": "Chinese Simplified", "langCode": "zho" },
"zh-Hant": { "label": "Chinese Traditional", "langCode": "zho" },
"zu": { "label": "Zulu", "langCode": "zul" }
}
31 changes: 31 additions & 0 deletions packages/client-adapters/src/client-adapters.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import ArethusaTreebankAdapter from '@clAdapters/adapters/arethusa/adapter'
import AlpheiosLogeionAdapter from '@clAdapters/adapters/logeion/adapter'
import AlpheiosTokenizationAdapter from '@clAdapters/adapters/tokenization/adapter'
import DTSAPIAdapter from '@clAdapters/adapters/dtsapi/adapter'
import DetectLangAdapter from '@clAdapters/adapters/detectlang/adapter'

import WrongMethodError from '@clAdapters/errors/wrong-method-error'
import NoRequiredParamError from '@clAdapters/errors/no-required-param-error'
Expand Down Expand Up @@ -93,6 +94,11 @@ class ClientAdapters {
return cachedAdaptersList.get('dtsapiGroup')
}

static get detectlangGroup () {
ClientAdapters.init()
return cachedAdaptersList.get('detectlangGroup')
}

/**
* This method checks if given method is registered in config for category.adapterName
* @param {String} category - category name - morphology, lemmatranslation, lexicon
Expand Down Expand Up @@ -427,6 +433,10 @@ class ClientAdapters {
return null
}

/**
* It is used for getting TEI texts from DTS API
* @param {Object} options
*/
static async dtsApiMethod (options) {
ClientAdapters.checkMethodParam('dtsapiGroup', 'dtsapi', options)

Expand All @@ -453,6 +463,27 @@ class ClientAdapters {
return { result: res, errors: localDTSAPIAdapter.errors }
}
}

/**
* It is used for detecting language by text
* @param {Object} options
*/
static async detectLangMethod (options) {
ClientAdapters.checkMethodParam('detectlangGroup', 'detectlang', options)

const localDetectLangAdapter = new DetectLangAdapter({
category: 'detectlangGroup',
adapterName: 'detectlang',
method: options.method,
clientId: options.clientId,
sourceData: options.params.sourceData
})

if (options.method === 'getDetectedLangsList') {
const res = await localDetectLangAdapter.getDetectedLangsList(options.params.text)
return { result: res, errors: localDetectLangAdapter.errors }
}
}
}

export default ClientAdapters
11 changes: 11 additions & 0 deletions packages/client-adapters/src/locales/en-us/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -204,5 +204,16 @@
"description": "Error message for DTS API adapter - problems with fetching words from api",
"component": "dtsapiGroup.general",
"params": ["message"]
},
"DETECT_LANG_URL_ERROR": {
"message": "There are not enough parameters for detect language request",
"description": "Error message for DetectLang adapter - not enough parameters (e.g. empty text) to build the request URL",
"component": "detectlangGroup.detectlang"
},
"DETECT_LANG_FETCH_ERROR": {
"message": "Some problems with detection language request API - {message}",
"description": "Error message for DetectLang adapter",
"component": "detectlangGroup.detectlang",
"params": ["message"]
}
}
Loading

0 comments on commit 1327d40

Please sign in to comment.