Skip to content

Commit

Permalink
Merge branch 'spheroscope' of github.com:ausgerechnet/cwb-cads into s…
Browse files Browse the repository at this point in the history
…pheroscope
  • Loading branch information
SpitfireX committed Dec 19, 2024
2 parents 1ed0dd6 + ee2a599 commit 5d3cc7c
Show file tree
Hide file tree
Showing 20 changed files with 963 additions and 261 deletions.
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,4 @@ ENV/
*.Rproj
.RData
.Rhistory

# Config
# ------
cfg.py
frontend/.vite/
3 changes: 2 additions & 1 deletion cads/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,15 @@ def docs():
return redirect(request.base_url + "docs")

# register blueprints
from . import collocation, corpus, keyword, query, semantic_map, users
from . import collocation, corpus, keyword, query, semantic_map, users, library

app.register_blueprint(users.bp)
app.register_blueprint(corpus.bp)
app.register_blueprint(query.bp)
app.register_blueprint(collocation.bp)
app.register_blueprint(semantic_map.bp)
app.register_blueprint(keyword.bp)
app.register_blueprint(library.bp)

from . import mmda
app.register_blueprint(mmda.bp)
Expand Down
303 changes: 302 additions & 1 deletion cads/database.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import os
import json
import re

from datetime import datetime

from ccc import Corpus as Crps
Expand Down Expand Up @@ -159,7 +163,7 @@ def nr_tokens(self):

def ccc(self):
return Crps(corpus_name=self.cwb_id,
lib_dir=current_app.config['CCC_LIB_DIR'],
lib_dir=os.path.join(current_app.config['CCC_LIB_DIR'], f"corpus_{self.id}"),
cqp_bin=current_app.config['CCC_CQP_BIN'],
registry_dir=current_app.config['CCC_REGISTRY_DIR'],
data_dir=current_app.config['CCC_DATA_DIR'],
Expand Down Expand Up @@ -278,6 +282,15 @@ class SegmentationSpanAnnotation(db.Model):

# QUERIES #
###########
def parse_macro_call_arguments(argstring):
    """Count the arguments supplied in a macro call.

    ``argstring`` is the raw text between the square brackets of a macro
    call.  A falsy value (empty string or ``None``) means the macro was
    called without arguments; otherwise the arguments are taken to be
    separated by commas.
    """
    # n separating commas delimit n + 1 arguments
    return argstring.count(",") + 1 if argstring else 0


class Query(db.Model):
"""Query: executed in CQP and dumped to disk
Expand All @@ -286,13 +299,18 @@ class Query(db.Model):
__table_args__ = ({'sqlite_autoincrement': True})

id = db.Column(db.Integer, primary_key=True)
name = db.Column(db.Unicode(255))
version = db.Column(db.Unicode(255))
type = db.Column(db.Unicode(10))
modified = db.Column(db.DateTime, default=datetime.utcnow)

corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id', ondelete='CASCADE'))
subcorpus_id = db.Column(db.Integer, db.ForeignKey('sub_corpus.id', ondelete='CASCADE')) # run on previously defined subcorpus
zero_matches = db.Column(db.Boolean, default=False)
error = db.Column(db.Boolean, default=False)

comment = db.Column(db.Unicode)

filter_sequence = db.Column(db.Unicode)

match_strategy = db.Column(db.Unicode, default='longest')
Expand All @@ -308,6 +326,79 @@ class Query(db.Model):
concordances = db.relationship('Concordance', backref='_query', passive_deletes=True, cascade='all, delete')
cotexts = db.relationship('Cotext', backref='_query', passive_deletes=True, cascade='all, delete')

wordlist_calls = db.relationship("WordListCall", passive_deletes=True, cascade='all, delete')
macro_calls = db.relationship("MacroCall", passive_deletes=True, cascade='all, delete')

__table_args__ = (
db.UniqueConstraint("name", "version"), # alternative primary key for version history
)

__mapper_args__ = {
"polymorphic_on": "type",
"polymorphic_identity": "query",
}

def __init__(self, **kwargs):
    """Create the query, add it to the session and resolve its dependencies.

    The CQP query string (``self.cqp_query``) is scanned for word list
    references (``$name``) and macro calls (``/name[args]``).  Each
    reference is resolved to the highest-versioned ``WordList`` / ``Macro``
    of the same corpus (macros must additionally match the number of call
    arguments) and recorded as a ``WordListCall`` / ``MacroCall`` row.

    Side effects: the query is added to ``db.session``; the session is
    committed once after word list resolution and once more after macro
    resolution.

    Raises:
        Exception: if a referenced word list or macro cannot be resolved.
            The query is marked for deletion in the session before raising.
    """
    super(Query, self).__init__(**kwargs)
    db.session.add(self)

    # Assign self.id right away: it is used as a foreign key below and
    # db.session.delete() requires a persistent instance.  Previously this
    # only worked because the lookups below happened to trigger an autoflush.
    db.session.flush()

    ## perform dependency resolution and fail if resolution is impossible

    ## word list extraction

    wl_matches = re.finditer(r"\$([a-zA-Z_][a-zA-Z0-9_\-]*)", self.cqp_query)
    wl_calls = {wl[1] for wl in wl_matches}
    if wl_calls:
        current_app.logger.debug(f"\tcontains word lists '{wl_calls}'")

    # resolve word lists and save relationship for later mangling before execution
    for identifier in wl_calls:
        # latest version of the word list with this name in the query's corpus
        wl = WordList.query \
            .filter(WordList.corpus_id == self.corpus_id) \
            .filter(WordList.name == identifier) \
            .order_by(WordList.version.desc()) \
            .first()

        if not wl:
            db.session.delete(self)
            raise Exception(f"undefined word list {identifier}")
        else:
            call = WordListCall(
                query_id=self.id,
                wordlist_id=wl.id
            )
            db.session.add(call)

    db.session.commit()

    ## macro extraction

    macro_matches = re.finditer(r"/([a-zA-Z_][a-zA-Z0-9_\-]*)\[(.*?)\]", self.cqp_query)
    # a macro is identified by its name together with its number of arguments
    macro_calls = {(m[1], parse_macro_call_arguments(m[2])) for m in macro_matches}
    if macro_calls:
        current_app.logger.debug(f"\tcontains macros '{macro_calls}'")

    # resolve macros and save relationship for later mangling before execution
    for identifier, valency in macro_calls:
        # latest version of the macro with this name and valency in the query's corpus
        macro = Macro.query \
            .filter(Macro.corpus_id == self.corpus_id) \
            .filter(Macro.name == identifier) \
            .filter(Macro.valency == valency) \
            .order_by(Macro.version.desc()) \
            .first()

        if not macro:
            # NOTE(review): the query was already committed above; this delete
            # is only marked in the session, so the caller has to commit it.
            db.session.delete(self)
            raise Exception(f"undefined macro {identifier} with valency {valency}")
        else:
            call = MacroCall(
                query_id=self.id,
                macro_id=macro.id
            )
            db.session.add(call)

    db.session.commit()

@property
def number_matches(self):
sql_query = f"SELECT count(*) FROM matches WHERE query_id == {self.id};"
Expand Down Expand Up @@ -885,6 +976,216 @@ class KeywordItemScore(db.Model):
score = db.Column(db.Float)


# WORD LISTS AND MACROS #
#########################
class WordList(db.Model):
    """Named, versioned list of words that belongs to one corpus.

    The combination of name, version and corpus is unique, which allows a
    version history per name.  ``write()`` dumps the words to a text file
    below the corpus-specific directory in ``CCC_LIB_DIR``.
    """

    id = db.Column(db.Integer, primary_key=True)

    name = db.Column(db.Unicode(255), nullable=False)
    version = db.Column(db.Integer, nullable=False)
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id'), nullable=False)
    # user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False)

    # Pass the callable, not its result: ``datetime.now()`` would be evaluated
    # once at import time and stamp every row with the process start time.
    modified = db.Column(db.DateTime, nullable=False, default=datetime.now)

    words = db.relationship("WordListWords", backref="word_list", cascade="all, delete")
    # p_att = db.Column(db.Unicode(50), nullable=False)

    comment = db.Column(db.Unicode)

    # Single definition: a second ``__table_args__`` assignment would replace
    # the first one and silently drop the sqlite_autoincrement option.
    __table_args__ = (
        db.UniqueConstraint("name", "version", "corpus_id"),  # alternative primary key for version history
        {'sqlite_autoincrement': True},
    )

    @property
    def path(self):
        """Path of the text file this word list is written to."""
        return os.path.join(current_app.config['CCC_LIB_DIR'], f"corpus_{self.corpus_id}", "wordlists", f"{self.name}__v{self.version}.txt")

    def write(self):
        """Write the words to ``self.path``, one per line, creating directories as needed."""
        path = self.path
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # explicit encoding so the file content does not depend on the locale
        with open(path, "wt", encoding="utf-8") as f:
            f.write("\n".join(w.word for w in self.words))


class WordListWords(db.Model):
    """A single word stored as an entry of a word list."""

    __table_args__ = {"sqlite_autoincrement": True}

    id = db.Column(db.Integer(), primary_key=True)

    # owning word list; the foreign key is declared with ON DELETE CASCADE
    wordlist_id = db.Column(
        db.Integer,
        db.ForeignKey("word_list.id", ondelete="CASCADE"),
    )

    word = db.Column(db.Unicode(), nullable=True)

    def __str__(self):
        """Return the stored word."""
        return self.word

class WordListCall(db.Model):
    """Association row: a query references a word list."""

    __table_args__ = {"sqlite_autoincrement": True}

    # composite primary key (query, word list); both foreign keys cascade on delete
    query_id = db.Column(
        db.Integer,
        db.ForeignKey("query.id", ondelete="CASCADE"),
        primary_key=True,
    )
    wordlist_id = db.Column(
        db.Integer,
        db.ForeignKey("word_list.id", ondelete="CASCADE"),
        primary_key=True,
    )

    wordlist = db.relationship("WordList")


class Macro(db.Model):
    """Named, versioned macro definition that belongs to one corpus.

    A macro is identified by name, valency (number of arguments), version
    and corpus.  On construction the body is scanned for nested word list
    references (``$name``) and nested macro calls (``/name[args]``); each is
    resolved to the highest-versioned definition in the same corpus, the
    identifier in the body is replaced by its versioned name, and the
    dependency is recorded as a ``NestedWordList`` / ``NestedMacro`` row.
    """

    id = db.Column(db.Integer, primary_key=True)

    name = db.Column(db.Unicode(255), nullable=False)
    valency = db.Column(db.Integer, nullable=False)
    # JSON-encoded list of argument names (decoded in write()); optional
    argument_names = db.Column(db.Unicode(255), nullable=True)
    version = db.Column(db.Integer, nullable=False)
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id'), nullable=False)
    # user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False)

    # Pass the callable, not its result: ``datetime.now()`` would be evaluated
    # once at import time and stamp every row with the process start time.
    modified = db.Column(db.DateTime, nullable=False, default=datetime.now)

    body = db.Column(db.Unicode)

    comment = db.Column(db.Unicode)

    nested_wordlist = db.relationship("NestedWordList", passive_deletes=True, cascade='all, delete')
    nested_macro = db.relationship("NestedMacro", passive_deletes=True,
                                   cascade='all, delete', primaryjoin="Macro.id == NestedMacro.macro_id")

    # Single definition: a second ``__table_args__`` assignment would replace
    # the first one and silently drop the sqlite_autoincrement option.
    __table_args__ = (
        db.UniqueConstraint("name", "valency", "version", "corpus_id"),  # alternative primary key for version history
        {'sqlite_autoincrement': True},
    )

    def __init__(self, **kwargs):
        """Create the macro, add it to the session and resolve nested dependencies.

        Raises:
            Exception: if a nested word list or nested macro call cannot be
                resolved.  The macro is marked for deletion in the session
                before raising.
        """
        super(Macro, self).__init__(**kwargs)
        db.session.add(self)

        ## perform dependency resolution and fail init if resolution is impossible
        # TODO: refactor and unify this code with Query __init__

        ## handle nested word lists
        nested_wordlists = {m[1] for m in re.finditer(r"\$([a-zA-Z_][a-zA-Z0-9_\-]*)", self.body)}
        if nested_wordlists:
            current_app.logger.debug(f"\tcontains nested word list calls: '{nested_wordlists}'")
            # self.id is needed below (foreign key, session delete); do not
            # depend on the lookups triggering an autoflush
            db.session.flush()

        for identifier in nested_wordlists:
            # check if word list with this identifier is in db
            # and get latest version
            wl = WordList.query \
                .filter(WordList.corpus_id == self.corpus_id) \
                .filter(WordList.name == identifier) \
                .order_by(WordList.version.desc()) \
                .first()

            if not wl:
                # abort and delete self if there is no candidate
                db.session.delete(self)
                raise Exception(f"undefined word list {identifier}")
            else:
                # mangle and replace identifiers in the macro definition;
                # the lookahead keeps "$foo" from rewriting the prefix of a
                # longer identifier such as "$foobar"
                pattern = re.compile(rf"\${re.escape(wl.name)}(?![a-zA-Z0-9_\-])", flags=re.S)
                mangled = f"${wl.name}__v{wl.version}"
                self.body = pattern.sub(lambda _match: mangled, self.body)

                # save the dependency in the db
                record = NestedWordList(
                    macro_id=self.id,
                    wordlist_id=wl.id
                )

                db.session.add(record)

        ## handle nested macros
        macro_matches = re.finditer(r"/([a-zA-Z_][a-zA-Z0-9_\-]*)\[(.*?)\]", self.body)
        nested_macros = {(m[1], parse_macro_call_arguments(m[2])) for m in macro_matches}
        if nested_macros:
            current_app.logger.debug(f"\tcontains nested macro calls: '{nested_macros}'")
            db.session.flush()  # see above: make sure self.id is assigned

        for identifier, valency in nested_macros:
            # check if macro with this identifier and valency is in db
            # and get latest version
            nm = Macro.query \
                .filter(Macro.corpus_id == self.corpus_id) \
                .filter(Macro.name == identifier) \
                .filter(Macro.valency == valency) \
                .order_by(Macro.version.desc()) \
                .first()

            if not nm:
                # abort and delete self if no candidate exists
                db.session.delete(self)
                raise Exception(f"undefined nested macro call {identifier} with valency {valency}")
            else:
                # mangle and replace identifiers in the macro definition.
                # The argument pattern is built outside the f-string because
                # backslashes inside f-string replacement fields are a
                # SyntaxError before Python 3.12.
                # NOTE(review): arguments containing whitespace or commas are
                # not matched here and therefore stay unmangled.
                single_argument = r"[^,\s]+?"
                arguments = ", ?".join(nm.valency * [single_argument])
                pattern = re.compile(rf"/{re.escape(nm.name)}(\[{arguments}\])", flags=re.S)
                mangled = f"/{nm.name}__{nm.valency}__v{nm.version}"
                self.body = pattern.sub(lambda match: mangled + match.group(1), self.body)

                # save the dependency in the db
                record = NestedMacro(
                    macro_id=self.id,
                    nested_id=nm.id
                )

                db.session.add(record)

    @property
    def canonical_name(self):
        """Name that encodes valency and version: ``<name>__<valency>__v<version>``."""
        return f"{self.name}__{self.valency}__v{self.version}"

    @property
    def path(self):
        """Path of the text file this macro definition is written to."""
        return os.path.join(current_app.config['CCC_LIB_DIR'], f"corpus_{self.corpus_id}",
                            "macros", self.canonical_name + ".txt")

    def write(self):
        """Write the macro definition to ``self.path``, creating directories as needed.

        The header lists the argument names if ``argument_names`` is set and
        the plain valency otherwise; the body follows, terminated by ``;``.
        """
        # Build the header before touching the file so that invalid argument
        # names cannot leave a truncated definition behind.
        if self.argument_names:
            names = json.loads(self.argument_names)
            assert len(names) == self.valency
            argstring = " ".join(f"${i}={n}" for i, n in enumerate(names))
            header = f"MACRO {self.canonical_name}({argstring})\n"
        else:
            header = f"MACRO {self.canonical_name}({self.valency})\n"

        path = self.path
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # explicit encoding so the file content does not depend on the locale
        with open(path, "wt", encoding="utf-8") as f:
            f.write(header)
            f.write(self.body)
            f.write(";\n")


class NestedMacro(db.Model):
    """Association row: a macro (``macro_id``) calls another macro (``nested_id``)."""

    __table_args__ = {"sqlite_autoincrement": True}

    # both columns reference macro.id and together form the primary key
    macro_id = db.Column(
        db.Integer,
        db.ForeignKey("macro.id", ondelete="CASCADE"),
        primary_key=True,
    )
    nested_id = db.Column(
        db.Integer,
        db.ForeignKey("macro.id", ondelete="CASCADE"),
        primary_key=True,
    )


class NestedWordList(db.Model):
    """Association row: a macro references a word list."""

    __table_args__ = {"sqlite_autoincrement": True}

    # composite primary key (macro, word list); both foreign keys cascade on delete
    macro_id = db.Column(
        db.Integer,
        db.ForeignKey("macro.id", ondelete="CASCADE"),
        primary_key=True,
    )
    wordlist_id = db.Column(
        db.Integer,
        db.ForeignKey("word_list.id", ondelete="CASCADE"),
        primary_key=True,
    )


class MacroCall(db.Model):
    """Association row: a query calls a macro."""

    __table_args__ = {"sqlite_autoincrement": True}

    # composite primary key (query, macro); both foreign keys cascade on delete
    query_id = db.Column(
        db.Integer,
        db.ForeignKey("query.id", ondelete="CASCADE"),
        primary_key=True,
    )
    macro_id = db.Column(
        db.Integer,
        db.ForeignKey("macro.id", ondelete="CASCADE"),
        primary_key=True,
    )

    macro = db.relationship("Macro")


# CLI #
#######
@bp.cli.command('init')
Expand Down
Loading

0 comments on commit 5d3cc7c

Please sign in to comment.