def parse_macro_call_arguments(argstring):
    """Return the number of comma-separated arguments in a macro call.

    An empty (or otherwise falsy) argument string means the macro was
    called with zero arguments, e.g. ``/foo[]``.
    """
    return len(argstring.split(",")) if argstring else 0
db.Column(db.Integer, primary_key=True) + name = db.Column(db.Unicode(255)) + version = db.Column(db.Unicode(255)) + type = db.Column(db.Unicode(10)) modified = db.Column(db.DateTime, default=datetime.utcnow) corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id', ondelete='CASCADE')) @@ -293,6 +309,8 @@ class Query(db.Model): zero_matches = db.Column(db.Boolean, default=False) error = db.Column(db.Boolean, default=False) + comment = db.Column(db.Unicode) + filter_sequence = db.Column(db.Unicode) match_strategy = db.Column(db.Unicode, default='longest') @@ -308,6 +326,79 @@ class Query(db.Model): concordances = db.relationship('Concordance', backref='_query', passive_deletes=True, cascade='all, delete') cotexts = db.relationship('Cotext', backref='_query', passive_deletes=True, cascade='all, delete') + wordlist_calls = db.relationship("WordListCall", passive_deletes=True, cascade='all, delete') + macro_calls = db.relationship("MacroCall", passive_deletes=True, cascade='all, delete') + + __table_args__ = ( + db.UniqueConstraint("name", "version"), # alternative primary key for version history + ) + + __mapper_args__ = { + "polymorphic_on": "type", + "polymorphic_identity": "query", + } + + def __init__(self, **kwargs): + super(Query, self).__init__(**kwargs) + db.session.add(self) + + ## perform dependency resolution and fail if resolution is impossible + + ## word list extraction + + wl_matches = re.finditer(r"\$([a-zA-Z_][a-zA-Z0-9_\-]*)", self.cqp_query) + wl_calls = {wl[1] for wl in wl_matches} + if wl_calls: + current_app.logger.debug(f"\tcontains word lists '{wl_calls}'") + + # resolve word lists and save relationship for later mangling before execution + for identifier in wl_calls: + wl = WordList.query \ + .filter(WordList.corpus_id == self.corpus_id) \ + .filter(WordList.name == identifier) \ + .order_by(WordList.version.desc()) \ + .first() + + if not wl: + db.session.delete(self) + raise Exception(f"undefined word list {identifier}") + else: + call = 
class WordList(db.Model):
    """A named, versioned list of words attached to a corpus.

    Word lists are referenced from CQP queries as ``$name`` and are
    materialised on disk (see :meth:`write`) so CQP can load them.
    """

    id = db.Column(db.Integer, primary_key=True)

    name = db.Column(db.Unicode(255), nullable=False)
    version = db.Column(db.Integer, nullable=False)
    corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id'), nullable=False)
    # user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False)

    # NB: pass the callable, not datetime.now() — calling it here would bake
    # the module-import timestamp into every row as the default
    modified = db.Column(db.DateTime, nullable=False, default=datetime.now)

    words = db.relationship("WordListWords", backref="word_list", cascade="all, delete")
    # p_att = db.Column(db.Unicode(50), nullable=False)

    comment = db.Column(db.Unicode)

    # single __table_args__ assignment: the original assigned it twice
    # (dict, then tuple), silently discarding {'sqlite_autoincrement': True}
    __table_args__ = (
        db.UniqueConstraint("name", "version", "corpus_id"),  # alternative primary key for version history
        {'sqlite_autoincrement': True},
    )

    @property
    def path(self):
        """Canonical on-disk location of this word list (one word per line)."""
        return os.path.join(current_app.config['CCC_LIB_DIR'], f"corpus_{self.corpus_id}",
                            "wordlists", f"{self.name}__v{self.version}.txt")

    def write(self):
        """Write the word list to :attr:`path`, creating directories as needed."""
        os.makedirs(os.path.dirname(self.path), exist_ok=True)
        with open(self.path, "wt") as f:
            f.write("\n".join(w.word for w in self.words))
"corpus_id"), # alternative primary key for version history + ) + + @property + def path(self): + return os.path.join(current_app.config['CCC_LIB_DIR'], f"corpus_{self.corpus_id}", "wordlists", f"{self.name}__v{self.version}.txt") + + def write(self): + os.makedirs(os.path.dirname(self.path), exist_ok=True) + with open(self.path, "wt") as f: + f.write("\n".join([w. word for w in self.words])) + + +class WordListWords(db.Model): + + __table_args__ = {'sqlite_autoincrement': True} + + id = db.Column(db.Integer(), primary_key=True) + wordlist_id = db.Column(db.Integer, db.ForeignKey('word_list.id', ondelete='CASCADE')) + + word = db.Column(db.Unicode(), nullable=True) + + def __str__(self): + return self.word + +class WordListCall(db.Model): + + __table_args__ = {'sqlite_autoincrement': True} + + query_id = db.Column(db.Integer, db.ForeignKey("query.id", ondelete="CASCADE"), primary_key=True) + wordlist_id = db.Column(db.Integer, db.ForeignKey('word_list.id', ondelete="CASCADE"), primary_key=True) + + wordlist = db.relationship("WordList") + + +class Macro(db.Model): + + __table_args__ = {'sqlite_autoincrement': True} + + id = db.Column(db.Integer, primary_key=True) + + name = db.Column(db.Unicode(255), nullable=False) + valency = db.Column(db.Integer, nullable=False) + argument_names = db.Column(db.Unicode(255), nullable=True) + version = db.Column(db.Integer, nullable=False) + corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id'), nullable=False) + # user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False) + + modified = db.Column(db.DateTime, nullable=False, default=datetime.now()) + + body = db.Column(db.Unicode) + + comment = db.Column(db.Unicode) + + nested_wordlist = db.relationship("NestedWordList", passive_deletes=True, cascade='all, delete') + nested_macro = db.relationship("NestedMacro", passive_deletes=True, \ + cascade='all, delete', primaryjoin="Macro.id == NestedMacro.macro_id") + + __table_args__ = ( + 
db.UniqueConstraint("name", "valency", "version", "corpus_id"), # alternative primary key for version history + ) + + def __init__(self, **kwargs): + super(Macro, self).__init__(**kwargs) + db.session.add(self) + + ## perform dependency resolution and fail init if resolution is impossible + # TODO: refactor and unify this code with Query __init__ + + ## handle nested word lists + nested_wordlists = {m[1] for m in re.finditer(r"\$([a-zA-Z_][a-zA-Z0-9_\-]*)", self.body)} + if nested_wordlists: + current_app.logger.debug(f"\tcontains nested word list calls: '{nested_wordlists}'") + + for identifier in nested_wordlists: + # check if word list with this identifier is in db + # and get latest version + wl = WordList.query \ + .filter(WordList.corpus_id == self.corpus_id) \ + .filter(WordList.name == identifier) \ + .order_by(WordList.version.desc()) \ + .first() + + if not wl: + # abort and delete self if there is no candidate + db.session.delete(self) + raise Exception(f"undefined word list {identifier}") + else: + # mangle and replace identifiers in the macro definition + pattern = fr"\${wl.name}" + repl = fr"${wl.name}__v{wl.version}" + self.body = re.sub(pattern, repl, self.body, flags=re.S) + + # save the dependency in the db + record = NestedWordList( + macro_id=self.id, + wordlist_id=wl.id + ) + + db.session.add(record) + + ## handle nested macros + macro_matches = re.finditer(r"/([a-zA-Z_][a-zA-Z0-9_\-]*)\[(.*?)\]", self.body) + nested_macros = {(m[1], parse_macro_call_arguments(m[2])) for m in macro_matches} + if nested_macros: + current_app.logger.debug(f"\tcontains nested macro calls: '{nested_macros}'") + + for identifier, valency in nested_macros: + # check if macro with this identifier and valency is in db + # and get latest version + nm = Macro.query \ + .filter(Macro.corpus_id == self.corpus_id) \ + .filter(Macro.name == identifier) \ + .filter(Macro.valency == valency) \ + .order_by(Macro.version.desc()) \ + .first() + + if not nm: + # abort and delete 
self if no candidate exists + db.session.delete(self) + raise Exception(f"undefined nested macro call {identifier} with valency {valency}") + else: + # mangle and replace identifiers in the macro definition + pattern = fr"/{nm.name}(\[{', ?'.join(nm.valency * [r'[^,\s]+?'])}\])" + repl = fr"/{nm.name}__{nm.valency}__v{nm.version}\1" + self.body = re.sub(pattern, repl, self.body, flags=re.S) + + # save the dependency in the db + record = NestedMacro( + macro_id=self.id, + nested_id=nm.id + ) + + db.session.add(record) + + @property + def canonical_name(self): + return f"{self.name}__{self.valency}__v{self.version}" + + @property + def path(self): + return os.path.join(current_app.config['CCC_LIB_DIR'], f"corpus_{self.corpus_id}", \ + "macros", self.canonical_name + ".txt") + + def write(self): + os.makedirs(os.path.dirname(self.path), exist_ok=True) + with open(self.path, "wt") as f: + + if self.argument_names: + names = json.loads(self.argument_names) + assert len(names) == self.valency + argstring = " ".join(f"${i}={n}" for i, n in enumerate(names)) + f.write(f"MACRO {self.canonical_name}({argstring})\n") + else: + f.write(f"MACRO {self.canonical_name}({self.valency})\n") + f.write(self.body) + f.write(";\n") + + +class NestedMacro(db.Model): + + __table_args__ = {'sqlite_autoincrement': True} + + macro_id = db.Column(db.Integer, db.ForeignKey('macro.id', ondelete="CASCADE"), primary_key=True) + nested_id = db.Column(db.Integer, db.ForeignKey('macro.id', ondelete="CASCADE"), primary_key=True) + + +class NestedWordList(db.Model): + + __table_args__ = {'sqlite_autoincrement': True} + + macro_id = db.Column(db.Integer, db.ForeignKey('macro.id', ondelete="CASCADE"), primary_key=True) + wordlist_id = db.Column(db.Integer, db.ForeignKey('word_list.id', ondelete="CASCADE"), primary_key=True) + + +class MacroCall(db.Model): + + __table_args__ = {'sqlite_autoincrement': True} + + query_id = db.Column(db.Integer, db.ForeignKey("query.id", ondelete="CASCADE"), 
# a bare argument count, e.g. "3" or "10"
argstring_simple = re.compile(r"\d+")
# one named argument, e.g. "0=lemma" (single-digit index)
argstring_named = re.compile(r"\d=([a-zA-Z_][a-zA-Z0-9_\-]*)")


def parse_macro_arguments(argstring):
    """Parse the argument spec of a CQP ``MACRO`` definition header.

    Two forms are supported:
    - a plain count, e.g. ``MACRO foo(2)`` -> ``(2, None)``
    - named arguments, e.g. ``MACRO foo($0=np $1=vp)`` -> ``(2, ['np', 'vp'])``

    Returns a tuple ``(valency, argument_names)`` where ``argument_names``
    is ``None`` for the plain-count form (and for an empty spec).

    NB: the original used ``argstring_simple.match`` (pattern ``\\d|10``),
    which only anchors at the start — a *named* spec such as ``"0=np 1=vp"``
    begins with a digit, so it took the plain-count branch and
    ``int(argstring)`` raised ValueError. ``fullmatch`` fixes this, and
    ``\\d+`` subsumes the dead ``|10`` alternative.
    """
    if argstring_simple.fullmatch(argstring.strip()):
        return int(argstring), None
    args = argstring_named.findall(argstring)
    return len(args), args if args else None
+ + wordlist = WordList( + name=name, + version=1, + corpus_id=corpus_id, + comment='imported via CLI' + ) + db.session.add(wordlist) + db.session.commit() + + for word in words: + db.session.add(WordListWords(wordlist_id=wordlist.id, word=word)) + db.session.commit() + + wordlist.write() + + +def import_library(lib_dir, corpus_id, username): + + """Imports a library of macros, word lists and slot queries for a + given corpus from a given directory""" + + paths_macros = glob(os.path.join(lib_dir, "macros", "*.txt")) + paths_wordlists = glob(os.path.join(lib_dir, "wordlists", "*.txt")) + paths_queries = glob(os.path.join(lib_dir, "queries", "*.cqpy")) + + for path in paths_wordlists: + import_wordlist(path, corpus_id) + + for path in paths_macros: + # load the file + with open(path, "rt") as f: + file = f.read().strip() + + # break up into individual macros and import + for match in re.finditer(r"MACRO ([a-zA-Z_][a-zA-Z0-9_\-]*)\((.*?)\)(.+?);", file, flags=re.S): + name = match[1] + arguments = match[2] + body = match[3] + + valency, argument_names = parse_macro_arguments(arguments) + + # process body to remove indentation + body = body.strip() + body = "\n".join(line.strip() for line in body.split("\n")) + body += "\n" + + if argument_names: + app.logger.debug(f"Importing macro '{name}' with arguments {argument_names} from file '{path}'") + else: + app.logger.debug(f"Importing macro '{name}' with {valency} arguments from file '{path}'") + + import_macro(name, valency, argument_names, body, corpus_id) + + for path in paths_queries: + import_slot_query(path, corpus_id) + + +################ +# API schemata # +################ + +class MacroOut(Schema): + + id = Integer() + modified = String() + corpus_id = Integer() + name = String() + version = Integer() + comment = String() + macro = String() + + +class WordListOut(Schema): + + id = Integer() + modified = String() + corpus_id = Integer() + name = String() + version = Integer() + comment = String() + words = 
# NOTE(review): the route placeholders below were reconstructed from the
# handler signatures — confirm against the original source paths.

@bp.get("/<corpus_id>/macros/<id>")
@bp.output(MacroOut)
@bp.auth_required(auth)
def get_macro(corpus_id, id):
    """Gets a single macro for a specified corpus"""

    corpus = db.get_or_404(Corpus, corpus_id)

    # .first() + explicit None check instead of .one() inside a bare
    # except: a bare except also swallowed unrelated errors (DB failures,
    # KeyboardInterrupt) and masked them as 404
    macro = Macro.query \
        .filter(Macro.corpus_id == corpus.id) \
        .filter(Macro.id == id) \
        .first()

    if macro is None:
        return abort(404, message=f"Macro with id {id} does not exist for corpus with id {corpus_id} in database")

    return MacroOut().dump(macro), 200


@bp.get("/<corpus_id>/macros")
@bp.output(MacroOut(many=True))
@bp.auth_required(auth)
def get_macros(corpus_id):
    """Gets all macros for a specified corpus"""

    corpus = db.get_or_404(Corpus, corpus_id)

    macros = Macro.query \
        .filter(Macro.corpus_id == corpus.id) \
        .all()

    return [MacroOut().dump(m) for m in macros], 200


@bp.get("/<corpus_id>/wordlists/<id>")
@bp.output(WordListOut)
@bp.auth_required(auth)
def get_word_list(corpus_id, id):
    """Gets a single wordlist for a specified corpus"""

    corpus = db.get_or_404(Corpus, corpus_id)

    # same fix as get_macro: no bare except
    wl = WordList.query \
        .filter(WordList.corpus_id == corpus.id) \
        .filter(WordList.id == id) \
        .first()

    if wl is None:
        return abort(404, message=f"Word list with id {id} does not exist for corpus with id {corpus_id} in database")

    return WordListOut().dump(wl), 200


@bp.get("/<corpus_id>/wordlists")
@bp.output(WordListOut(many=True))
@bp.auth_required(auth)
def get_word_lists(corpus_id):
    """Gets all wordlists for a specified corpus"""

    corpus = db.get_or_404(Corpus, corpus_id)

    wls = WordList.query \
        .filter(WordList.corpus_id == corpus.id) \
        .all()

    return [WordListOut().dump(wl) for wl in wls], 200
import_library(lib_dir, corpus_id, username) diff --git a/cads/query.py b/cads/query.py index 7561ffda..b2d967c6 100644 --- a/cads/query.py +++ b/cads/query.py @@ -1,6 +1,8 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- +import re + from random import randint from apiflask import APIBlueprint, Schema, abort @@ -45,9 +47,26 @@ def ccc_query(query, return_df=True): else: corpus = query.corpus.ccc() + # process query to mangle library identifiers + mangled_query = query.cqp_query + + # apply macro mangling + for mc in query.macro_calls: + m = mc.macro + pattern = fr"/{m.name}(\[{', ?'.join(m.valency * [r'[^,\s]+?'])}\])" + repl = fr"/{m.name}__{m.valency}__v{m.version}\1" + mangled_query = re.sub(pattern, repl, mangled_query) + + # apply wordlist mangling + for wlc in query.wordlist_calls: + wl = wlc.wordlist + pattern = fr"\${wl.name}" + repl = fr"${wl.name}__v{wl.version}" + mangled_query = re.sub(pattern, repl, mangled_query) + # query corpus current_app.logger.debug('ccc_query :: querying') - matches = corpus.query(cqp_query=query.cqp_query, + matches = corpus.query(cqp_query=mangled_query, context_break=query.s, match_strategy=query.match_strategy, propagate_error=True) @@ -56,7 +75,7 @@ def ccc_query(query, return_df=True): current_app.logger.error(f"ccc_query :: error: '{matches}'") query.error = True db.session.commit() - return DataFrame() + return DataFrame() if len(matches.df) == 0: # no matches current_app.logger.debug("0 matches") diff --git a/cads/spheroscope/__init__.py b/cads/spheroscope/__init__.py index ad8a52cf..820c0e6b 100644 --- a/cads/spheroscope/__init__.py +++ b/cads/spheroscope/__init__.py @@ -4,10 +4,12 @@ from apiflask import APIBlueprint -from . import library, slot_query +from .. import library + +from . 
import slot_query, query_history bp = APIBlueprint('spheroscope', __name__, url_prefix='/spheroscope') bp.register_blueprint(slot_query.bp) -bp.register_blueprint(library.bp) +bp.register_blueprint(query_history.bp) diff --git a/cads/spheroscope/database.py b/cads/spheroscope/database.py index 226cfdc5..f563fdbe 100644 --- a/cads/spheroscope/database.py +++ b/cads/spheroscope/database.py @@ -9,114 +9,34 @@ from flask import current_app from .. import db -from ..database import Corpus +from ..database import Query -class WordList(db.Model): - - __table_args__ = {'sqlite_autoincrement': True} - - # __table_args__ = ( - # db.UniqueConstraint('name', 'corpus_id', name='unique_name_corpus'), - # ) - - id = db.Column(db.Integer, primary_key=True) - modified = db.Column(db.DateTime, nullable=False, default=datetime.now()) - - # corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id'), nullable=False) - # user_id = db.Column(db.Integer, db.ForeignKey('user.id')) - - name = db.Column(db.Unicode(255), nullable=False) - words = db.relationship("WordListWords", backref="word_list", cascade="all, delete") - # p_att = db.Column(db.Unicode(50), nullable=False) - - comment = db.Column(db.Unicode) - - @property - def path(self): - return os.path.join(current_app.config['CCC_LIB_DIR'], "wordlists", self.name + ".txt") - - def write(self): - os.makedirs(os.path.dirname(self.path), exist_ok=True) - with open(self.path, "wt") as f: - f.write("\n".join([w. 
word for w in self.words])) - - -class WordListWords(db.Model): - - __table_args__ = {'sqlite_autoincrement': True} - - id = db.Column(db.Integer(), primary_key=True) - wordlist_id = db.Column(db.Integer, db.ForeignKey('word_list.id', ondelete='CASCADE')) - - word = db.Column(db.Unicode(), nullable=True) - - -class Macro(db.Model): - - __table_args__ = {'sqlite_autoincrement': True} - - # __table_args__ = ( - # db.UniqueConstraint('name', 'corpus_id', name='unique_name_corpus'), - # ) - - id = db.Column(db.Integer, primary_key=True) - modified = db.Column(db.DateTime, nullable=False, default=datetime.now()) - - # corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id'), nullable=False) - # user_id = db.Column(db.Integer, db.ForeignKey('user.id')) - - name = db.Column(db.Unicode(255), nullable=False) - macro = db.Column(db.Unicode) - - comment = db.Column(db.Unicode) - - @property - def path(self): - return os.path.join(current_app.config['CCC_LIB_DIR'], "macros", self.name + ".txt") - - def write(self): - os.makedirs(os.path.dirname(self.path), exist_ok=True) - with open(self.path, "wt") as f: - f.write(self.macro) - - -class SlotQuery(db.Model): - - # __table_args__ = ( - # db.UniqueConstraint('name', 'corpus_id', name='unique_name_corpus'), - # ) - - id = db.Column(db.Integer, primary_key=True) - modified = db.Column(db.DateTime, nullable=False, default=datetime.now()) - - corpus_id = db.Column(db.Integer, db.ForeignKey('corpus.id'), nullable=False) - # user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False) - - name = db.Column(db.Unicode(255), nullable=False) +class SlotQuery(Query): _corrections = db.Column(db.Unicode) _slots = db.Column(db.Unicode) - cqp_query = db.Column(db.Unicode) - match_strategy = db.Column(db.Unicode, default='longest') - - comment = db.Column(db.Unicode) - @property - def corpus(self): - return db.get_or_404(Corpus, self.corpus_id) + __mapper_args__ = { + "polymorphic_identity": "slot_query", + } @property def 
class QueryHistory(db.Model):
    """A named, ordered collection of query references (spheroscope)."""

    __table_args__ = {'sqlite_autoincrement': True}

    id = db.Column(db.Integer, primary_key=True)

    name = db.Column(db.Unicode)

    entries = db.relationship("QueryHistoryEntry", back_populates="parent",
                              passive_deletes=True, cascade='all, delete')

    def add_entry(self, query_id, comment):
        """Append a new entry for *query_id* and return it (not committed)."""

        # do NOT pass history_id=self.id here (as the original did): for an
        # unflushed history self.id is still None; appending to the
        # relationship lets SQLAlchemy set the FK correctly on flush
        entry = QueryHistoryEntry(
            query_id=query_id,
            comment=comment
        )

        self.entries.append(entry)

        return entry
/usr/bin/env python -# -*- coding: utf-8 -*- - -import json -import os -from glob import glob - -import click -from ccc.cqpy import cqpy_load -from flask import Blueprint - -from .. import db -from .database import Macro, SlotQuery, WordList, WordListWords - -bp = Blueprint('library', __name__, url_prefix='/library', cli_group='library') - - -def ccc_get_library(slot_query, wordlists=[], macros=[]): - - crps = slot_query.corpus.ccc() - cqp = crps.start_cqp() - - for wordlist in wordlists: - name = wordlist.split('/')[-1].split('.')[0] - abs_path = os.path.abspath(wordlist) - cqp_exec = f'define ${name} < "{abs_path}";' - cqp.Exec(cqp_exec) - - # macros - for macro in macros: - abs_path = os.path.abspath(macro) - cqp_exec = f'define macro < "{abs_path}";' - cqp.Exec(cqp_exec) - # for wordlists defined in macros, it is necessary to execute the macro once - macros = cqp.Exec("show macro;").split("\n") - for macro in macros: - # NB: this yields !cqp.Ok() if macro is not zero-valent - cqp.Exec(macro.split("(")[0] + "();") - - cqp.Exec("set ParseOnly on;") - cqp.Exec('set PrettyPrint off;') - cqp.Exec("set SpheroscopeDebug on;") - cqp.Exec("set SpheroscopeDebug;") - - result = cqp.Exec(slot_query.cqp_query) - cqp.__del__() - - wordlists = list() - macros = list() - for line in result.split("\n"): - if line.startswith("WORDLIST"): - wordlists.append(line.split(" ")[-1]) - elif line.startswith("MACRO"): - macros.append(line.split(" ")[-1]) - - return { - 'wordlists': wordlists, - 'macros': macros - } - - -def import_macro(path): - - name = path.split("/")[-1].split(".")[0] - with open(path, "rt") as f: - macro = f.read().strip() - - macro = Macro( - name=name, - macro=macro, - comment='imported macro' - ) - db.session.add(macro) - db.session.commit() - macro.write() - - -def import_wordlist(path): - - name = path.split('/')[-1].split('.')[0] - with open(path, "rt") as f: - words = f.read().strip().split("\n") - - wordlist = WordList( - name=name, - comment='imported 
wordlist' - ) - db.session.add(wordlist) - db.session.commit() - for word in words: - db.session.add(WordListWords(wordlist_id=wordlist.id, word=word)) - db.session.commit() - wordlist.write() - - -def import_query(path, corpus_id): - - query = cqpy_load(path) - - slots = [{'slot': key, 'start': str(value[0]), 'end': str(value[1])} for key, value in query['anchors']['slots'].items()] - corrections = [{'anchor': str(key), 'correction': int(value)} for key, value in query['anchors']['corrections'].items()] - - slot_query = SlotQuery( - corpus_id=corpus_id, - cqp_query=query['cqp'], - name=query['meta']['name'], - _slots=json.dumps(slots), - _corrections=json.dumps(corrections) - ) - db.session.add(slot_query) - db.session.commit() - slot_query.write() - - -def import_library(lib_dir, corpus_id, username): - - paths_macros = glob(os.path.join(lib_dir, "macros", "*.txt")) - paths_wordlists = glob(os.path.join(lib_dir, "wordlists", "*.txt")) - paths_queries = glob(os.path.join(lib_dir, "queries", "*.cqpy")) - - for path in paths_macros: - import_macro(path) - for path in paths_wordlists: - import_wordlist(path) - for path in paths_queries: - import_query(path, corpus_id) - - -@bp.cli.command('import-library') -@click.option('--lib_dir', default='tests/library/') -def import_library_cmd(lib_dir): - - corpus_id = 1 # TODO - username = 'admin' - - import_library(lib_dir, corpus_id, username) diff --git a/cads/spheroscope/query_history.py b/cads/spheroscope/query_history.py new file mode 100644 index 00000000..4c860cdd --- /dev/null +++ b/cads/spheroscope/query_history.py @@ -0,0 +1,133 @@ +#! 
@bp.post('/')
@bp.input(QueryHistoryIn)
@bp.output(QueryHistoryOut)
@bp.auth_required(auth)
def create(json_data):
    """Create a new query history.

    """

    # id is assigned by sqlite autoincrement; the original passed id=None
    # explicitly, which is redundant
    query_history = QueryHistory(
        name=json_data.get("name")
    )

    db.session.add(query_history)
    db.session.commit()

    return QueryHistoryOut().dump(query_history), 200
+ + """ + + history = db.get_or_404(QueryHistory, id) + + entry = history.add_entry( + json_data.get("query_id"), + json_data.get("comment") + ) + + try: + db.session.commit() + return QueryHistoryOut().dump(history), 200 + except: + return abort(400, message=f"Query with id {json_data.get('query_id')} does not exist in database") + + +@bp.get("/") +@bp.output(QueryHistoryOut) +@bp.auth_required(auth) +def get_history(id): + """Get all entries in a given query history. + + """ + + history = db.get_or_404(QueryHistory, id) + + return QueryHistoryOut().dump(history), 200 + + +@bp.delete("/") +@bp.auth_required(auth) +def delete_history(id): + """ Deletes a given query history + + """ + + history = db.get_or_404(QueryHistory, id) + + try: + db.session.delete(history) + db.session.commit() + return f"Deleted QueryHistory with id {id} successfully", 200 + except: + return abort(400, message=f"QueryHistory with id {id} could not be deleted") diff --git a/cads/spheroscope/slot_query.py b/cads/spheroscope/slot_query.py index acd0d38e..dbcf2550 100644 --- a/cads/spheroscope/slot_query.py +++ b/cads/spheroscope/slot_query.py @@ -2,20 +2,75 @@ # -*- coding: utf-8 -*- import json +import re +import os from apiflask import APIBlueprint, Schema from apiflask.fields import Integer, Nested, String from apiflask.validators import OneOf -from flask import current_app +from ccc.cqpy import cqpy_load + +from flask import current_app as app from .. 
def ccc_get_library(slot_query, wordlists=None, macros=None):
    """Determine which word lists and macros a slot query actually uses.

    Loads the given word-list and macro files into a throw-away CQP
    session, parses (but does not execute) the query with
    SpheroscopeDebug enabled, and collects the WORDLIST/MACRO names that
    CQP reports as referenced.

    :param slot_query: SlotQuery whose ``cqp_query`` is analysed
    :param wordlists: paths of word-list files to define (``$name``)
    :param macros: paths of macro definition files to load
    :return: dict with keys 'wordlists' and 'macros', each a list of names
    """

    # avoid mutable default arguments
    wordlists = wordlists or []
    macros = macros or []

    crps = slot_query.corpus.ccc()
    cqp = crps.start_cqp()

    try:
        # word lists: $name is derived from the file's basename
        for wordlist_path in wordlists:
            name = wordlist_path.split('/')[-1].split('.')[0]
            cqp.Exec(f'define ${name} < "{os.path.abspath(wordlist_path)}";')

        # macros
        for macro_path in macros:
            cqp.Exec(f'define macro < "{os.path.abspath(macro_path)}";')
        # for word lists defined in macros, it is necessary to execute
        # each macro once
        for defined_macro in cqp.Exec("show macro;").split("\n"):
            # NB: this yields !cqp.Ok() if macro is not zero-valent
            cqp.Exec(defined_macro.split("(")[0] + "();")

        cqp.Exec("set ParseOnly on;")
        cqp.Exec('set PrettyPrint off;')
        cqp.Exec("set SpheroscopeDebug on;")
        cqp.Exec("set SpheroscopeDebug;")

        result = cqp.Exec(slot_query.cqp_query)
    finally:
        # make sure the CQP child process is killed even if parsing fails
        cqp.__del__()

    # collect the names CQP reported in its debug output
    used_wordlists = []
    used_macros = []
    for line in result.split("\n"):
        if line.startswith("WORDLIST"):
            used_wordlists.append(line.split(" ")[-1])
        elif line.startswith("MACRO"):
            used_macros.append(line.split(" ")[-1])

    return {
        'wordlists': used_wordlists,
        'macros': used_macros
    }
def import_slot_query(path, corpus_id):
    """Import a .cqpy query file as a SlotQuery for the given corpus.

    Parses anchors (slots and corrections) from the cqpy header, stores
    the query in the database, and writes a copy to the library
    directory.

    :param path: path to the .cqpy file
    :param corpus_id: id of the corpus the query belongs to
    """

    query = cqpy_load(path)

    # anchors are optional; slots map names to (start, end) anchor pairs,
    # corrections map anchor numbers to offsets
    anchors = query.get('anchors')
    slots = []
    corrections = []
    if anchors:
        if anchors.get('slots'):
            slots = [{'slot': key, 'start': str(value[0]), 'end': str(value[1])} for key, value in query['anchors']['slots'].items()]
        if anchors.get('corrections'):
            corrections = [{'anchor': str(key), 'correction': int(value)} for key, value in query['anchors']['corrections'].items()]

    app.logger.debug(f"importing SlottedQuery {query['meta']['name']}")

    corpus = db.get_or_404(Corpus, corpus_id)

    try:
        slot_query = SlotQuery(
            corpus_id=corpus.id,
            cqp_query=query['cqp'],
            name=query['meta']['name'],
            _slots=json.dumps(slots) if slots else None,
            _corrections=json.dumps(corrections) if corrections else None,
            s=corpus.s_default
        )
        db.session.add(slot_query)
        # commit only on success — committing in a finally block would
        # also run after a failed init and persist a broken session state
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        app.logger.error(f"could not init query: {e}")
        return

    ## dump to library directory
    # TODO: is this really necessary? it just basically duplicates the input and is never read by the backend
    slot_query.write()

    # execute query to cache results (?)
    # ccc_query(slot_query, return_df=False)
venv/bin/activate && \ - export CWB_CADS_CONFIG=${config} && \ - flask --app cads library import-library --lib_dir ${library} + export CWB_CADS_CONFIG=cfg.DevConfig && \ + flask --app cads library import-library --corpus_id 1 --lib_dir "tests/library/" examples: init corpora discoursemes diff --git a/tests/conftest.py b/tests/conftest.py index 118a6322..60a137ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ from cads.corpus import meta_from_within_xml, read_corpora, subcorpora_from_tsv from cads.database import init_db from cads.mmda.discourseme import import_discoursemes -from cads.spheroscope.library import import_library +from cads.library import import_library app = create_app('cfg.TestConfig') diff --git a/tests/library/macros/annoying_overloading.txt b/tests/library/macros/annoying_overloading.txt new file mode 100644 index 00000000..8bef80ae --- /dev/null +++ b/tests/library/macros/annoying_overloading.txt @@ -0,0 +1,34 @@ +MACRO annoying(1) +( + [lemma = "$0"] +) +; + +MACRO annoying(2) +( + [lemma = "$0"] + [lemma = "$1"] +) +; + +MACRO annoying(3) +( + [lemma = "$0"] + [lemma = "$1"] + [lemma = "$2"] +) +; + +MACRO annoying_nested(4) +( + /annoying[$0, $1] + /annoying[$2, $3] +) +; + +MACRO annoying_nested2(3) +( + /annoying[$0] + /annoying[$1, $2] +) +; diff --git a/tests/library/macros/np.txt b/tests/library/macros/np_entity.txt similarity index 52% rename from tests/library/macros/np.txt rename to tests/library/macros/np_entity.txt index 0b3bec68..d606cd89 100644 --- a/tests/library/macros/np.txt +++ b/tests/library/macros/np_entity.txt @@ -1,15 +1,7 @@ -MACRO np_generic(0) -( -([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* -[pos="N.+|PPER|PPOSS" & word!=","]+) ("und|,|\-"([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* -[pos="N.+|PPER|PPOSS" & word!=","]+))* -) -; - -MACRO np_entity(0) -( -([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* -[pos="PPER|PPOSS|NE" | pos="NN" & lemma=$nouns_entity]+) ("und|,|\-"([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* -[pos="PPER|PPOSS|NE" | 
pos="NN" & lemma=$nouns_entity]+))* -) -; \ No newline at end of file +MACRO np_entity(0) +( +([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* +[pos="PPER|PPOSS|NE" | pos="NN" & lemma=$nouns_entity]+) ("und|,|\-"([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* +[pos="PPER|PPOSS|NE" | pos="NN" & lemma=$nouns_entity]+))* +) +; diff --git a/tests/library/macros/np_generic.txt b/tests/library/macros/np_generic.txt new file mode 100644 index 00000000..c09be398 --- /dev/null +++ b/tests/library/macros/np_generic.txt @@ -0,0 +1,7 @@ +MACRO np_generic(0) +( +([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* +[pos="N.+|PPER|PPOSS" & word!=","]+) ("und|,|\-"([pos="ART|ADJ.+|P[DI]AT|PPOSAT"]* +[pos="N.+|PPER|PPOSS" & word!=","]+))* +) +; diff --git a/tests/library/macros/phrases.txt b/tests/library/macros/phrases.txt new file mode 100644 index 00000000..d4eebab1 --- /dev/null +++ b/tests/library/macros/phrases.txt @@ -0,0 +1,21 @@ +MACRO adjp() + [pos = "RB.*"]? + [pos = "JJ.*"] +; + +MACRO np($0=N_Adj) + [pos = "DT"] + ( /adjp[] ){$0} + [pos = "NNS?"] +; + +MACRO np($0=Noun $1=N_Adj) + [pos = "DT"] + ( /adjp[] ){$1} + [(pos = "NN") & (lemma = "$0")] +; + +MACRO pp($0=Prep $1=N_Adj) + [(word = "$0") & (pos = "IN|TO")] + /np[$1] +; diff --git a/tests/library/queries/broken_query.cqpy b/tests/library/queries/broken_query.cqpy new file mode 100644 index 00000000..f9249193 --- /dev/null +++ b/tests/library/queries/broken_query.cqpy @@ -0,0 +1,19 @@ +# --- # CQPY query file +# anchors: +# corrections: +# 1: -1 +# 3: -1 +# slots: +# '0': +# - 0 +# - 1 +# '1': +# - 2 +# - 3 +# meta: +# comment: null +# name: broken_query +# pattern: 52 +# --- + +[lemma=$does_not_exist] diff --git a/tests/library/queries/overloaded.cqpy b/tests/library/queries/overloaded.cqpy new file mode 100644 index 00000000..e8a14a69 --- /dev/null +++ b/tests/library/queries/overloaded.cqpy @@ -0,0 +1,7 @@ +# --- # CQPY query file +# meta: +# comment: null +# name: overloaded +# --- + +/annoying["Arbeit"] /annoying["zu", "leisten"] diff --git 
a/tests/library/queries/overloaded2.cqpy b/tests/library/queries/overloaded2.cqpy new file mode 100644 index 00000000..6d05766b --- /dev/null +++ b/tests/library/queries/overloaded2.cqpy @@ -0,0 +1,7 @@ +# --- # CQPY query file +# meta: +# comment: null +# name: overloaded +# --- + +/annoying_nested["Arbeit", "zu", "leisten", "haben"] diff --git a/tests/library/queries/overloaded3.cqpy b/tests/library/queries/overloaded3.cqpy new file mode 100644 index 00000000..6a604968 --- /dev/null +++ b/tests/library/queries/overloaded3.cqpy @@ -0,0 +1,7 @@ +# --- # CQPY query file +# meta: +# comment: null +# name: overloaded2 +# --- + +/annoying_nested2["Arbeit", "zu", "leisten"]