Merge pull request #52 from ausgerechnet/v0.11.2
v0.11.2
ausgerechnet authored Nov 25, 2022
2 parents 80d758e + 8523bbe commit b5f67f7
Showing 11 changed files with 794 additions and 526 deletions.
10 changes: 10 additions & 0 deletions CITATION.cff
@@ -0,0 +1,10 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Heinrich"
  given-names: "Philipp"
  orcid: "https://orcid.org/0000-0002-4785-9205"
title: "cwb-ccc"
version: 0.11.2
date-released: 2022-11-25
url: "https://github.com/ausgerechnet/cwb-ccc"
2 changes: 1 addition & 1 deletion Pipfile
@@ -5,7 +5,7 @@ verify_ssl = true

[dev-packages]
cython = "==0.29.30"
pytest = "==7.0.1"
pytest = "==7.2.0"
pylint = "==2.13.9"
pytest-cov = "==3.0.0"
tabulate = "==0.8.9"
446 changes: 221 additions & 225 deletions Pipfile.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion ccc/concordances.py
@@ -7,7 +7,7 @@
"""
import itertools
import logging
-from random import sample
+from random import sample, seed

# requirements
from pandas import DataFrame
@@ -287,6 +287,9 @@ def lines(self, form='simple', p_show=['word'], s_show=[],
        if (cut_off is None) or (len(matches) < cut_off):
            cut_off = len(matches)
        # order
+        if isinstance(order, int):
+            seed(order)
+            order = 'random'
        if order == 'random':
            matches = sample(list(matches), cut_off)
        elif order == 'first':
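
With this change, passing an integer as `order` to `Concordance.lines()` seeds Python's random module and then samples as usual, so the same integer always yields the same random selection of concordance lines. A minimal usage sketch, assuming a standard cwb-ccc setup (the corpus name and query below are placeholders):

from ccc import Corpus

corpus = Corpus("GERMAPARL1386")                      # placeholder corpus
dump = corpus.query('[lemma="Arbeit"]', context_break='s')

# order=42 calls seed(42) and falls back to order='random':
# repeated runs return the same 10 randomly sampled lines
lines = dump.concordance(form='simple', order=42, cut_off=10)
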
184 changes: 181 additions & 3 deletions ccc/cwb.py
@@ -18,16 +18,21 @@
# part of module
from .cache import Cache, generate_idx, generate_library_idx
from .cl import Corpus as Attributes
from .concordances import Concordance
from .counts import Counts, cwb_scan_corpus
from .cqp import CQP
from .dumps import Dump
-from .utils import chunk_anchors, correct_anchors, preprocess_query
+from .utils import (chunk_anchors, correct_anchors, dump_left_join,
+                    format_roles, group_lines, preprocess_query, aggregate_matches)
from .version import __version__

logger = logging.getLogger(__name__)


def decode(text):
    """safely decode a string catching common errors
    """
    try:
        text = text.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
@@ -1101,8 +1106,7 @@ def query_cqp(self, cqp_query, context=20, context_left=None,
        )

        # if dump has been retrieved from cache, NQR might not exist
-        if self.show_nqr().empty or \
-           name not in self.show_nqr()['subcorpus'].values:
+        if save and (self.show_nqr().empty or name not in self.show_nqr()['subcorpus'].values):
            # undump the dump and save to disk
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_dump, name)
@@ -1160,3 +1164,177 @@ def query(self, cqp_query=None, context=20, context_left=None,

        else:
            raise NotImplementedError()

    def quick_query(self, s_context, topic_query="", filter_queries=[], match_strategy='longest'):
        """makes sure query result is defined as subcorpus.
        without topic query:
        - finds all s_context spans that contain at least one filter_query
        with topic query:
        - finds all s_context spans that contain topic_query and all filter_queries
        :return: identifier (name of NQR on disk)
        :rtype: str
        """

        if len(topic_query) == 0:
            identifier = generate_idx([self.subcorpus, filter_queries, s_context, match_strategy], prefix='Query')

            cqp = self.start_cqp()
            cqp.Exec(f'set MatchingStrategy "{match_strategy}";')
            size = int(cqp.Exec(f'size {identifier};'))

            if size == 0:
                disjunction = " | ".join(['(' + q + ')' for q in filter_queries])
                logger.info(f'disjunction query: {disjunction}')
                cqp.Query(f'{identifier} = {disjunction} within {s_context} expand to {s_context};')
                logger.info(f'.. saving {identifier} in CWB binary format')
                cqp.Exec(f'save {identifier};')

            return identifier

        # IDENTIFY
        topic_identifier = generate_idx([self.subcorpus, topic_query, s_context, match_strategy], prefix='Query')
        filter_identifier = generate_idx([self.subcorpus, topic_query, s_context, match_strategy, filter_queries], prefix='Query')

        # CHECK CQP
        cqp = self.start_cqp()
        cqp.Exec(f'set MatchingStrategy "{match_strategy}";')
        size = int(cqp.Exec(f'size {filter_identifier};'))

        if size == 0:

            # TODO: avoid saving twice if there's no filter
            size = int(cqp.Exec(f'size {topic_identifier};'))

            logger.info(f'topic query: {topic_query}')
            if size == 0:
                # TOPIC
                cqp.Query(f'{topic_identifier} = {topic_query} expand to {s_context};')
                logger.info(f'.. saving {topic_identifier} in CWB binary format')
                cqp.Exec(f'save {topic_identifier};')
            logger.info('.. size: ' + cqp.Exec(f'size {topic_identifier};'))

            # FILTER
            cqp.Exec(f'{filter_identifier} = {topic_identifier};')
            for query in filter_queries:
                logger.info(f'filter query: {query}')
                cqp.Exec(f'{filter_identifier};')
                cqp.Query(f'{filter_identifier} = {query} expand to {s_context};')
                logger.info('.. size: ' + cqp.Exec(f'size {filter_identifier};'))

            # SAVE
            logger.info(f'.. saving {filter_identifier} in CWB binary format')
            cqp.Exec(f'save {filter_identifier};')

        cqp.__kill__()

        return filter_identifier

    def quick_conc(self, topic_query, s_context, window, order=42,
                   cut_off=100, highlight_queries=dict(),
                   filter_queries=dict(), p_show=['word'], s_show=[],
                   match_strategy='longest'):
        """
        :return: concordance lines, each one a dict
        :rtype: list(dict)
        """

        if len(topic_query) == 0:

            queries = {**highlight_queries, **filter_queries}

            # INIT CQP
            identifier = self.quick_query(s_context, topic_query="", filter_queries=queries.values(), match_strategy=match_strategy)
            cqp = self.start_cqp()

            # init CONTEXT (TextConstellation)
            cqp.Exec(f'cut {identifier} {cut_off};')
            df_context = cqp.Dump(f'{identifier};')
            dump_context = Dump(self.copy(), df_context, None)
            dump_context = dump_context.set_context(context_break=s_context)
            df_context = dump_context.df[['contextid']]
            df_context = df_context.reset_index().set_index('contextid')

            # HIGHLIGHT
            cqp.Exec(f'{identifier};')
            for name, query in queries.items():
                cqp.Exec(f'Temp = {query};')
                df_query = cqp.Dump('Temp;')
                if len(df_query) > 0:
                    dump_query = Dump(self.copy(), df_query, None)
                    dump_query = dump_query.set_context(context_break=s_context)
                    df_query = dump_query.df[['contextid']]
                    df_agg = aggregate_matches(df_query, name)
                    df_context = df_context.join(df_agg)
                else:
                    df_context[name] = None
                    df_context[name + '_BOOL'] = False
                    df_context[name + '_COUNTS'] = 0
            cqp.__kill__()

            # index by CONTEXT MATCHES
            df = df_context.set_index(['match', 'matchend'])
            names = list(queries.keys())
            names_bool = [n + '_BOOL' for n in names]
            names_count = [n + '_COUNTS' for n in names]
            for b, c in zip(names_bool, names_count):
                df[b] = df[b].fillna(False)
                df[c] = df[c].fillna(0)

            # ACTUAL CONCORDANCING
            conc = Concordance(self.copy(), df)
            lines = conc.lines(form='dict', p_show=p_show, s_show=s_show, order=order, cut_off=cut_off)
            output = lines.apply(lambda row: format_roles(row, names, s_show=names_bool+s_show, window=0, htmlify_meta=True), axis=1)

        else:

            # INIT CQP
            identifier = self.quick_query(s_context, topic_query, filter_queries.values(), match_strategy)
            cqp = self.start_cqp()

            # init CONTEXT (TextConstellation)
            cqp.Exec(f'{identifier};')
            df_context = cqp.Dump(f'{identifier};')
            dump_context = Dump(self.copy(), df_context, None)
            dump_context = dump_context.set_context(window, s_context)
            df_context = dump_context.df[['contextid', 'context', 'contextend']]

            # index by TOPIC MATCHES
            cqp.Exec(f'Temp = {topic_query};')
            df_query = cqp.Dump('Temp;')
            dump_query = Dump(self.copy(), df_query, None)
            dump_query = dump_query.set_context(window, s_context)
            df_context = dump_left_join(df_context, dump_query.df, 'topic', drop=True, window=window)
            df_context = df_context.set_index(['match_topic', 'matchend_topic'])
            df_context.index.names = ['match', 'matchend']

            # FILTER according to window size
            for name, query in filter_queries.items():
                cqp.Exec(f'Temp = {query};')
                df_query = cqp.Dump('Temp;')
                dump_query = Dump(self.copy(), df_query, None)
                dump_query = dump_query.set_context(window, s_context)
                df_context = dump_left_join(df_context, dump_query.df, name, drop=True, window=window)
                df_context = df_context.drop([c + "_" + name for c in ['match', 'matchend', 'offset']], axis=1)

            # HIGHLIGHT
            for name, query in highlight_queries.items():
                cqp.Exec(f'Temp = {query};')
                df_query = cqp.Dump('Temp;')
                dump_query = Dump(self.copy(), df_query, None)
                dump_query = dump_query.set_context(window, s_context)
                df_context = dump_left_join(df_context, dump_query.df, name, drop=False, window=window)

            cqp.__kill__()

            # ACTUAL CONCORDANCING
            hkeys = list(highlight_queries.keys())
            df = group_lines(df_context, hkeys)
            conc = Concordance(self.copy(), df)
            lines = conc.lines(form='dict', p_show=p_show, s_show=s_show, order=order, cut_off=cut_off)
            output = lines.apply(lambda row: format_roles(row, hkeys, s_show, window, htmlify_meta=True), axis=1)

        return list(output)
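
Taken together, `quick_query` materialises the (filtered) query result as a named NQR saved on disk, and `quick_conc` builds concordance lines on top of it, either per context span (no topic query) or around topic matches within a window. A hedged usage sketch, assuming both methods are called on a `ccc.Corpus` object as defined in `ccc/cwb.py`; the corpus name and CQP queries are illustrative placeholders:

from ccc import Corpus

corpus = Corpus("GERMAPARL1386")  # illustrative corpus name

# define and save an NQR: sentences containing the topic and the filter query;
# the returned string is the name of the NQR on disk
nqr_name = corpus.quick_query(
    s_context='s',
    topic_query='[lemma="Integration"]',
    filter_queries=['[lemma="Zuwanderung"]'],
)

# concordance lines around topic matches, reproducibly sampled via order=42
lines = corpus.quick_conc(
    topic_query='[lemma="Integration"]',
    s_context='s',
    window=10,
    highlight_queries={'migration': '[lemma="Migration"]'},
    filter_queries={'zuwanderung': '[lemma="Zuwanderung"]'},
    order=42,
    cut_off=50,
)
print(len(lines), lines[0].keys())
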
