full test coverage

cldf · Nov 8, 2024 · da61f9b · da61f9b
1 parent 4db7e44
commit da61f9b
Show file tree

Hide file tree

Showing 41 changed files with 1,647 additions and 56 deletions.
diff --git a/src/linglit/__init__.py b/src/linglit/__init__.py
@@ -39,9 +39,6 @@ def iter_examples(d='.', glottolog='glottolog', **dirs):  # pragma: no cover
     c = collections.Counter()
     glottolog = Glottolog(glottolog)
     for rid, cls in PROVIDERS.items():
-        # if rid != 'glossa':
-        if rid != 'langsci':
-            continue
         sd = dirs.get(rid, d / rid)
         bibtex = sd / 'bibtex'
         if sd.exists():
@@ -66,7 +63,6 @@ def iter_examples(d='.', glottolog='glottolog', **dirs):  # pragma: no cover
                 bibtex.joinpath(
                     '{}.bib'.format(pub.record.ID)).write_text('\n\n'.join(t), encoding='utf8')
 
-                continue
                 pid = '{}-{}'.format(rid, pub.record.ID)
                 for i, ex in enumerate(pub.iter_examples()):
                     if ex.Language_ID is None:

diff --git a/src/linglit/base/__init__.py b/src/linglit/base/__init__.py
@@ -110,7 +110,7 @@ def __attrs_post_init__(self):
                 cmt = cmt.strip()
                 if len(cmt.split()) > 1:
                     if self.Comment:
-                        self.Comment += ';'
+                        self.Comment += '; '
                     self.Comment = (self.Comment or '') + cmt
                 else:
                     self.Corpus_Ref = cmt

diff --git a/src/linglit/cfg/langsci/texfile_titles.tsv b/src/linglit/cfg/langsci/texfile_titles.tsv
@@ -788,6 +788,7 @@ Book_ID	Filename	Language	Title
 280	simik.tex		Inherent vs. accidental uniqueness in bare and demonstrative nominals
 280	zimmermann.tex		The role of the correlate in clause-embedding
 280	zivojinovic.tex	Torlak	Torlak clitic doubling: A cross-linguistic comparison
+282	bliss.tex	siks1238	
 282	diessel.tex		A typology of demonstrative clause linkers
 282	fuchs.tex		Referential shift potential of demonstrative pronouns – Evidence from text continuation
 282	johannessen.tex		Psychologically distal demonstratives in Scandinavian are not “discourse new”
@@ -871,6 +872,21 @@ Book_ID	Filename	Language	Title
 290	08.tex		Building bridges between languages: How students develop crosslinguistic awareness in multilingual learning settings
 290	09.tex		Students’ perceptions of plurilingual nonnative teachers in higher education: An added or a mudded value?
 290	prefaceEd.tex		Preface
+291	02-Cupik.tex	cent2127	Constituency in Cup'ik and the problem of holophrasis
+291	03-Cherokee.tex	cher1273	Constituency in Oklahoma Cherokee
+291	04-Kiowa.tex	kiow1266	Constituency and Wordhood in Kiowa
+291	05-Mazatec.tex	ayau1235	Constituency in Ayautla Mazatec
+291	06-Mixtec.tex	juxt1235	Constituency in Tù'un Ntá'ví (Mixtec) of San Martín Duraznos
+291	07-Zapotec.tex	teot1238	Words as emergent constituents in Teotitlán del Valle Zapotec
+291	08-Chatino.tex	zenz1235	Constituency in Zenzontepec Chatino
+291	09-Martinican.tex	mart1259	Constituency in Martinican (creole, Martinique)
+291	10-Hup.tex	hupd1244	Constituency in Hup: Synchronic and diachronic perspectives
+291	11-Yukuna.tex	yucu1253	Constituency in Yukuna
+291	12-Mebengokre.tex	kaya1330	Constituency in Mẽbêngôkre independent clauses
+291	13-Araona.tex	arao1248	Graded constituency in the Araona (Takana) verb complex
+291	14-Quechua.tex	sout2991	Word structure and constituency in Uma Piwra South Bolivian Quechua
+291	15-Chorote.tex	iyoj1235	Wordhood in Chorote (Mataguayan)
+291	16-Mocovi.tex	moco1246	Constituency in Northern Chaco Mocoví (Guaycuruan, Argentina)
 293	avatime.tex	Avatime	A note on  wh -questions in Avatime
 293	barzlai.tex	Nobiin	Morphologically conditioned phonological variation in Nobiin
 293	bukusumu.tex	Lubukusu	Object marking in Lubukusu: Information structure in the verb phrase
@@ -1015,3 +1031,19 @@ Book_ID	Filename	Language	Title
 329	07.tex		Agreement inflection and word order in Viskadalian Swedish
 329	08.tex		From ‘big’ to ‘much’ From ‘big’ to ‘much’: On the grammaticalization of two gradable adjectives in Swedish
 329	prefaceEd.tex		Preface with an editor, abstract and citation footer
+383	kahigi.tex	sumb1240	Verb extensions and morphosyntactic variation in Bantu: The case of Sumbwa
+383	lukusa.tex	luba1249	A morphosyntactic study of verb object marking in Čilubà
+383	lusekelo.tex	nyak1261	Concord and agreement in Eastern Bantu: The augment and noun classes in Nyakyusa
+383	mallya.tex	bosh1240	The morphosyntax of locative expressions in Kiwoso
+383	ngwasi.tex	hehe1240	The historical development of the reflexive-reciprocal polysemy in Hehe
+383	taji.tex	yaoo1241	Demonstratives in Chiyao: An analysis of their form, distribution and functions
+383	yoneda.tex	gand1255	Multiple-object constructions in Ganda
+411	01-Arsenijevic.tex	sout1528	Specification of telicity in Serbo-Croatian, without null prefixes
+411	06-Georgieva.tex	bulg1262	Inflectionless adjectives in Bulgarian as a case of nominal predication
+411	08-Geist.tex	russ1263	Responding to negative biased questions in Russian
+411	09-Matushansky.tex	russ1263	Responding to negative biased questions in Russian
+411	12-Stepanov.tex	slov1268	Number mismatch effect and processing cataphora in a \textit{pro}-drop language: The case of Slovenian
+440	01.tex	Czech	LEMUR: A lexicon of Czech multiword expressions
+440	02.tex	poma1238	Description of Pomak within IDION: Challenges in the representation of verb multiword expressions
+440	07.tex	Dutch	MWE-Finder: Querying for multiword expressions in large Dutch text corpora
+440	09.tex	Swedish	Multiword expressions in Swedish as a second language: Taxonomy, annotation, and initial results
diff --git a/src/linglit/cldf/publication.py b/src/linglit/cldf/publication.py
@@ -24,7 +24,7 @@ def languages(self):
     def iter_references(self):
         sid2langs = collections.defaultdict(set)
         if not self.cfg.bib:
-            return
+            return  # pragma: no cover
         s2l = self.cfg.source_to_language
         l2gc = {}
         for row in self.languages:

diff --git a/src/linglit/cldf/repository.py b/src/linglit/cldf/repository.py
@@ -89,7 +89,7 @@ def create(self, verbose=False):
             if dldir.exists():
                 if self.metadata(did)['version'] == rec.version:
                     continue
-                shutil.rmtree(dldir)
+                shutil.rmtree(dldir)  # pragma: no cover
             print('downloading {} ...'.format(rec.version))
             rec.download_dataset(self.dir / did)
             print('... done')

diff --git a/src/linglit/commands/mergedbib.py b/src/linglit/commands/mergedbib.py
@@ -24,9 +24,9 @@ def bibtex(src):
     with TemporaryDirectory() as tmp:
         for pub in tqdm(repos.iter_publications()):
             if pub.id == 'langsci{}'.format(args.drop_until):
-                do = True
+                do = True  # pragma: no cover
             if not do:
-                continue
+                continue  # pragma: no cover
             with tmp.joinpath('{}.bib'.format(pub.id)).open('w') as bib:
                 bib.write(bibtex(pub.as_source()))
                 for src in pub.cited_references:

diff --git a/src/linglit/glossa/xml.py b/src/linglit/glossa/xml.py
@@ -150,8 +150,8 @@ def iter_igt(d, abbrs):
             numbers = [
                 t(li.xpath('list-item')[0])
                 for li in gloss.xpath(".//list[@list-type='wordfirst']")]
-        except IndexError:
-            continue
+        except IndexError:  # pragma: no cover
+            continue  # Something isn't as expected. We just skip this potential example.
         for n in numbers:
             m = re.match(r'\(([0-9]+|[iv]+)\)', n)
             if m:

diff --git a/src/linglit/langsci/examples.py b/src/linglit/langsci/examples.py
@@ -1,7 +1,7 @@
-import functools
 import re
 import typing
 import hashlib
+import functools
 
 from pyigt.igt import IGT, NON_OVERT_ELEMENT
 from pyigt.lgrmorphemes import MORPHEME_SEPARATORS
@@ -47,7 +47,7 @@ def parse_cmd(cmd, line):
         # Cut out the command and its first argument from line:
         texcmd = '\\' + cmd + '{' + line.split('\\' + cmd + '{')[-1].split('}')[0]
         cmd = getattr(TexSoup(texcmd, tolerance=1), cmd)
-    except:  # noqa: E722
+    except:  # pragma: no cover # noqa: E722
         raise ValueError(line)  # pragma: no cover
     return (cmd.args[0].string.split('!')[-1], '', '')
 
@@ -252,7 +252,7 @@ def lines_and_comment(lines):
                     comment.append(s.jambox.string)
                     s.jambox.delete()
                     line = str(s)
-            except:  # noqa: E722
+            except:  # pragma: no cover # noqa: E722
                 pass  # pragma: no cover
             if line:
                 res.append(line)
@@ -282,7 +282,8 @@ def lines_and_comment(lines):
                         to_text(res[-1].split('\n')[0])[0].strip())
                     if m:
                         if m.groups()[0][0].isalpha() and m.groups()[0][0].islower():
-                            linfo = (m.groups()[0], '', '')
+                            # Hm. Seems to be impossible given the regex.
+                            linfo = (m.groups()[0], '', '')  # pragma: no cover
                         else:
                             comment.append(m.groups()[0])
                         res = res[:-1]
@@ -318,7 +319,7 @@ def make_example(
     aligned = [line.strip() for line in re.split(r'\\(?:\\|newline)', aligned) if line.strip()]
 
     # book-specifics:
-    if pub.record.int_id == 212:
+    if pub.record.int_id == 212:  # pragma: no cover
         if len(aligned) > 2:
             if 'footnotesize' in aligned[2]:
                 aligned = aligned[:2]
@@ -347,7 +348,7 @@ def make_example(
         pt, gl = aligned
         obj = None
     elif len(aligned):
-        if len(aligned) == 4 and aligned[3].startswith(r'}\\jambox'):
+        if len(aligned) == 4 and aligned[3].startswith(r'}\jambox'):
             obj, pt = aligned[0], aligned[1]
             gl = aligned[2] + aligned[3]
         else:  # Dunno what to do here ...
@@ -356,7 +357,7 @@ def make_example(
             # print('---')
             return
     else:  # ... or here.
-        return
+        return  # pragma: no cover
     if obj:
         obj, cmt, _refs = to_text(obj)
         if _refs:
@@ -370,8 +371,8 @@ def make_example(
     if len(pt) != len(gl):
         if gl and gl[-1] in ['()', '*()']:
             gl = gl[:-1]
-    if len(pt) != len(gl):
-        return
+    if len(pt) != len(gl):  # Primary text cannot be aligned with glosses.
+        return  # pragma: no cover
 
     obj = obj or IGT(phrase=pt, gloss=gl).primary_text
     return Example(

diff --git a/src/linglit/langsci/latex.py b/src/linglit/langsci/latex.py
@@ -798,7 +798,7 @@
 def uppercase_arg(n, l2tobj):
     if n.nodeargd:
         return l2tobj.nodelist_to_text([n.nodeargd.argnlist[0]]).upper()
-    return ''
+    return ''  # pragma: no cover
 
 
 def dot_uppercase_arg(n, l2tobj):
@@ -829,7 +829,7 @@ def secondarg(n, l2tobj):
 
 
 def repl(abbr, *args):
-    return abbr
+    return abbr  # pragma: no cover
 
 
 def japhug(n, l2tobj):
@@ -871,19 +871,19 @@ def cite(n, l2tobj):
         # n.nodeargd can be empty if e.g. \putinquotes was a single
         # token passed as an argument to a macro,
         # e.g. \newcommand\putinquotes...
-        return ''
+        return ''  # pragma: no cover
     page = ''
     if len(n.nodeargd.argnlist) > 1:
         page = _get_optional_arg(n.nodeargd.argnlist[0], '', l2tobj)
     key = l2tobj.nodelist_to_text([n.nodeargd.argnlist[-1]]).strip().replace('   ', '&')
     if key:
         return '<cit page="{}">{}</cit>'.format(page.replace('"', ''), key)
-    return ''
+    return ''  # pragma: no cover
 
 
 def langinfo(n, l2tobj):
     if not n.nodeargd:
-        return ''
+        return ''  # pragma: no cover
     res = ''
     for i, arg in enumerate(n.nodeargd.argnlist):
         t = l2tobj.nodelist_to_text([arg]).strip()
@@ -990,7 +990,7 @@ def custom_latex_to_text(input_latex, parser=lw_context_db, converter=l2t_contex
     # convert to text
     try:
         return l2t_obj.nodelist_to_text(nodelist)
-    except (IndexError, ValueError):
+    except (IndexError, ValueError):  # pragma: no cover
         return input_latex
 
 
@@ -1034,20 +1034,20 @@ def to_text(latex):
 
     # extract citations:
     pattern = re.compile(r'<cit page="([^"]*)">([^<]+)</cit>')
-    for m in pattern.finditer(text):
-        if m.groups()[1] != '[':
-            for sid in m.groups()[1].split(','):
-                if sid.strip():
-                    refs.append((sid.strip(), m.groups()[0]))
-    if refs:
-        text = pattern.sub('', text).strip()
 
-    for cc in comment:
-        for m in pattern.finditer(cc):
+    def find_refs(t):
+        for m in pattern.finditer(t):
             if m.groups()[1] != '[':
                 for sid in m.groups()[1].split(','):
                     if sid.strip():
                         refs.append((sid.strip(), m.groups()[0]))
+
+    find_refs(text)
+    if refs:
+        text = pattern.sub('', text).strip()
+
+    for cc in comment:
+        find_refs(cc)
     comment = [pattern.sub(lambda m: m.groups()[1], cc).strip() for cc in comment]
 
     #

diff --git a/src/linglit/langsci/publication.py b/src/linglit/langsci/publication.py
@@ -239,7 +239,7 @@ def norm_include(s):
             p = m.get(p.name.lower(), p)
             if not p.exists() and (p.stem in ['preface', 'acknowledgments']):
                 continue
-            if not p.exists() and p.stem == 'abbreviations':
+            if not p.exists() and p.stem == 'abbreviations':  # pragma: no cover
                 if p.parent.parent.joinpath('abbreviations.tex').exists():
                     p = p.parent.parent.joinpath('abbreviations.tex')
             assert p.exists(), str(p)

diff --git a/src/linglit/langsci/repository.py b/src/linglit/langsci/repository.py
@@ -21,13 +21,27 @@
 CATALOG_NAME = "catalog.tsv"
 FILELIST_NAME = "files.json"
 MISSING_TEX_SOURCES = [
-    155, 192, 195, 255, 287, 297, 311, 325, 373, 380,
-    410,
-    284,  # For the time being ...
-    292,
+    155,
+    192,
+    195,
+    255,
+    287,
+    297,
+    311,
+    325,
+    373,
+    380,
+    # 410,
+    284,  # For the time being ... no main file found
+    # 292,
     438,
 ]
-MISSING_REPOS = [410, 389, 392, 393, 438]
+MISSING_REPOS = [  # Some publications don't have a public repository (yet).
+    410,
+    389,
+    # 392,
+    # 393,
+    438]
 TEX_BRANCH = {187: 'master'}
 
 
@@ -87,10 +101,8 @@ def __getitem__(self, item):
 
     def iter_publications(self):
         for item in self.catalog:
-            # if item.int_id != 22:
-            #     continue
             if item.int_id in MISSING_REPOS:
-                continue
+                continue  # pragma: no cover
             if item.int_id not in MISSING_TEX_SOURCES:
                 yield Publication(item, self.dir / item.ID, self)
 

diff --git a/tests/cldf/catalog.csv b/tests/cldf/catalog.csv
@@ -1,2 +1,3 @@
 id,name,conceptdoi,bib,source_to_language,hhtype,igt,gloss_abbreviations
-8,uratyp,10.5281/zenodo.5236365,1,LanguageTable,,1,
+8,uratyp,10.5281/zenodo.5236365,1,LanguageTable,grammar,1,ga.csv abbr def
+9,petersonsouthasia,10.5281/zenodo.5236365,1,ValueTable,,,
diff --git a/tests/cldf/petersonsouthasia.json b/tests/cldf/petersonsouthasia.json
@@ -0,0 +1,25 @@
+{
+  "doi": "10.5281/zenodo.6392555",
+  "title": "other",
+  "creators": [
+    "Vesakoski, Outi"
+  ],
+  "year": "2022",
+  "license": "cc-by-4.0",
+  "download_urls": [
+    "https://zenodo.org/records/6392555/files/cldf-datasets/uratyp-v1.1.zip/content"
+  ],
+  "keywords": [
+    "cldf:StructureDataset",
+    "linguistics"
+  ],
+  "communities": [],
+  "github_repos": {
+    "org": "cldf-datasets",
+    "name": "uratyp",
+    "tag": "v1.1"
+  },
+  "closed_access": false,
+  "version": "v1.1",
+  "concept_doi": "10.5281/zenodo.5236365"
+}