From 061ae65026ddb6c74155d43fd62b5935d5b81150 Mon Sep 17 00:00:00 2001 From: Philipp Heinrich Date: Sat, 4 Dec 2021 17:06:14 +0100 Subject: [PATCH 01/13] upgrade to assoc-measures 0.2.0 --- Pipfile | 2 +- Pipfile.lock | 90 +++++++++++++++++---------------------- ccc/collocates.py | 25 +++++------ ccc/counts.py | 86 ++++--------------------------------- ccc/keywords.py | 20 +++++---- setup.py | 2 +- tests/test_07_counts.py | 17 +++++++- tests/test_09_keywords.py | 28 ++++++------ tests/test_10_dumps.py | 3 +- 9 files changed, 106 insertions(+), 167 deletions(-) diff --git a/Pipfile b/Pipfile index 2ac661f..1134131 100644 --- a/Pipfile +++ b/Pipfile @@ -18,7 +18,7 @@ sphinx = "*" enthought-sphinx-theme = "*" [packages] -association-measures = ">=0.1.6" +association-measures = ">=0.2.0" pandas = ">=1.2.0" numexpr = ">=2.7.1" Bottleneck = ">=1.3.2" diff --git a/Pipfile.lock b/Pipfile.lock index d1303d2..9a94701 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "483ee8eadaab22c4244579ea697b881701c0687f959bbf6b129cbf4b92dfee74" + "sha256": "e68175e004b516d943e76c8ecb6a1c9f5d7ebeb64480355a8af6810def854125" }, "pipfile-spec": 6, "requires": {}, @@ -16,10 +16,10 @@ "default": { "association-measures": { "hashes": [ - "sha256:bd34516a7741b2b6f224f69fa2575ea650a99967f61f015878aff7d473be9c90" + "sha256:4dd35cbc4b572d82b76aeb3bbab3d6068bf378a0541cb4c4cf2688a508936a89" ], - "index": "pypi", - "version": "==0.1.7" + "path": "./../pandas-association-measures/dist/association-measures-0.2.0.tar.gz", + "version": "==0.2.0" }, "bottleneck": { "hashes": [ @@ -30,47 +30,35 @@ }, "numexpr": { "hashes": [ - "sha256:05b97b19e864a5d1a0b106933b1637233a2444fd375685bead264a818f847ef2", - "sha256:0732c9989bff8568ee78fa461f3698166d4ac79363860be22ff49eae1dcd15e7", - "sha256:23718ac5f2ebae995f5899509624781b375da568f2b645b5d1fd6dbb17f41a56", - "sha256:24cdb8c0e93f31387a4c2ddd09a687874c006e6139fd68bcf77b96e51d17cb01", - "sha256:2e14b44a79030fbe25f16393162a4d21ced14056fac49ff73856f661a78db731", - "sha256:3daa55515ee3cb40bf5ab8263c0c13fff8d484d64d107a9c414e8ca151dc08a6", - "sha256:43616529f9b7d1afc83386f943dc66c4da5e052f00217ba7e3ad8dd1b5f3a825", - "sha256:4527a0a7b04f858a73c348c9c4ce8441b7a54965db74a32ba808c51d9d53b7cd", - "sha256:51277a530a353e0f94665b44615249d7e7075f0c73f78d4743da632fc44bc648", - "sha256:5223a519f48754dd350723d9fbcadbcd0476881bc954a281a09a6538ecabfc27", - "sha256:5d6dbf050a9b8ebff0b7706ebeaf1cd57d64ef4dfe61aef3790851b481daf6b5", - "sha256:5f4122bd58aa4e4891814c2f72bd47b1cdb202c9d863ea96c5394dffb72a16e2", - "sha256:602df9b5c500d0a887dc96b4cfd16fb60ae7ef39ccd6f013f4df2ee11ae70553", - "sha256:618259287b8b81a352a7d088ad03fe3b393a842ccb45f0b3cfc6a712d41b7595", - "sha256:74df157ab4577bfc83c14f4e39d14781b06ade5406d3efef049f90c88d8c28ea", - "sha256:785065819ce98e3d3dd853794244e0de190d7ba36ab42c8fd79e0e9cd40de7af", - "sha256:7ab40e2b438f4ea2ea8234c63639cdf5072cdb29d0ac521307854efe0281a567", - "sha256:833a363c86266424349467b53f4060f77aaa7ec03c1e6f38c54e69c65ceebf30", - "sha256:8b76bcca930cbf0db0fe98b6a51d6286dff77d525dad670cb7750e29a138d434", - "sha256:8fc23a49f4266c24a23310c0cb92ff54c4b4f535635f90372b3a2d5cb1f83329", - "sha256:90ea6d5813e1906bb203ef220a600b30d83e75aea2607a7e7037cceae9e93346", - "sha256:97753d17d1ea39e082b1907b99b6cb63cac7d1dfa512d2ff5079eb7bfab1ea88", - "sha256:99472731bc1111f5d73285dd2a4c228b5bfb176f785a567872e0fbfec6584f2b", - "sha256:a3f1cec8657bd3920869a2ea27f98d68ac3000334f366d844a9670ae671fe4bd", - 
"sha256:a8e0e48d72391543b68d0471fac2e31c614efdce4036e2a0a8a182fde1edb0e0", - "sha256:aae4ce158da53ebc47df053de90fed9d0d51fa0df8cc481abc8a901ea4f0cec7", - "sha256:b0a9124a66a61b05ea84b832358d6aa5561c30e69b4dcaea819b296f4f025f89", - "sha256:c2605e5665b0d7362e0d2b92683387c12e15c7440daf702a7637f7502a967810", - "sha256:c9218aeb76717768f617362b72a87e9219da95ba7cdec0732ccecc4a4719124c", - "sha256:c978c49bd9dded6a4ba6b3501e3a34e3aba9312cbb7d800bed7ac6fcd2d5949d", - "sha256:d14ae09318ad86579e35aacf1596c83d5db1139cd68615967ee23605e11f5d82", - "sha256:d423441593a952ac56d1f774068b81fb22f514fb68873c066578345a6af74c0d", - "sha256:dc707486b1f3dda18a39bc4d06a0a09d3c0ea47bd6b99fdb98adb26d1277253f", - "sha256:dfdca3d1f4c83fa8fd3ee7573110efd13e838543896641b89367622ec6a67eb4", - "sha256:e000570a6a704c594832ff4fc45f18864b721b7b444a185b365dbb03d3fe3abb", - "sha256:e985026e64350dd59fd91a09bc364edf706d58b12e01362ddfa63829878bd434", - "sha256:eeeb6325df6cf3f3ab7d9dbabf3bc03ac88b7e2f2aed21419c31e23c3048dce1", - "sha256:f9df0a74d39616fd011071c5850418f244bac414f24ed55c00dcf3c5385e8374" + "sha256:078ae8c6b7028a893f72bd8a88cdda1f12e08dfa1eb67783a41da360524ca5dc", + "sha256:09b69583b694466c2f02311a69c35e35e59fed2842decd8926aaf6466478fad6", + "sha256:0bdec52a27cc7945cf58650f585bdc3e9089f097190c31cd061dd80b864484bb", + "sha256:1150ffee400f23b413c07ed739e2d7e63df2049739535916972769c86f3feb09", + "sha256:142dabece2a223cf744f0afe633016ae56d4992a0798258e62da5b503e87ebed", + "sha256:217e04783abaedfc8173f3f0aef555db5ce81c210e23f3ea695cc523b41da638", + "sha256:2c8f548690a302db859f5b79c057d8fe552f95c0f40f36804f9bda02ec660917", + "sha256:2d8cc7181c6bbe315781aed5895a4a5e90d161841ded1deb792be1fd2b1a539f", + "sha256:3b429e69572e57ce7476de3bac984f237d980b28bfce72cb610ade0e7ae6442d", + "sha256:3be075577725c23bf1e50501ecc095421ac4e3e5f75bd4842dd1e928cd4f4bdd", + "sha256:4979cdea3814dd4519cf413c13ed705114c41f4d5a70d490ef0f4904255e93fd", + "sha256:4e74eb1ccdd96b7726b34b018c0f210486dacc3de86bb86b808b1abd25b535e6", + "sha256:6e0be6495b92447459f1f0c6b270624a489a1e434a9c8f68950e1450cb825f38", + "sha256:6e0cc5d65b02eff4aa3d37da005597871915b977800892c149b82495c141b630", + "sha256:7143a7e0a5db48d4302e12280f04a886c367f8cea19dccd009be6bbefa4bbc0c", + "sha256:7c604b695f07e49e6ac58a2eb9dca0dae8b9cb115e3cd6148b28c7be1c49249d", + "sha256:9c99f207cef209f5dd42c7f389753e9ace24b9e306a278264581b5390dd9e7d6", + "sha256:9fec076b76c90a5f3929373f548834bb203c6d23a81a895e60d0fe9cca075e99", + "sha256:a96293ab134d09aeb3b8b09cf2745ed1debbf3a3aa7071bcca93c9d0c9dcf3f9", + "sha256:ab63acfbe55c05f33dc5d33fde1449434b458a40a1ee9324897c27dc0629794d", + "sha256:b70369863c5f1fcee22ef8979317067e2baa84659194c733632f58e89bab729f", + "sha256:cb2bee19e39c73110b9f12a83ffd1438c13e487ef2237985c0434c67243a4aa5", + "sha256:cebc549dbe99eeee4be9a6ab12f19366c14d88686041db3b97093aa60b5834be", + "sha256:d9caca66cfc5bfcba22e94414408bcd89c036d66c63fca313d5bfd6712d2b9de", + "sha256:edbb4cbd4c0a1976bd3dff856f88e6477dfd2d843e0927f85cdc964d5c8544f5", + "sha256:f7ae9a282bce5b5a184ddc1572bc23fa1e656fb45511a574bccb2b9aad907b1c" ], "index": "pypi", - "version": "==2.7.3" + "version": "==2.8.0" }, "numpy": { "hashes": [ @@ -348,11 +336,11 @@ }, "charset-normalizer": { "hashes": [ - "sha256:735e240d9a8506778cd7a453d97e817e536bb1fc29f4f6961ce297b9c7a917b0", - "sha256:83fcdeb225499d6344c8f7f34684c2981270beacc32ede2e669e94f7fa544405" + "sha256:1eecaa09422db5be9e29d7fc65664e6c33bd06f9ced7838578ba40d58bdf3721", + "sha256:b0b883e8e874edfdece9c28f314e3dd5badf067342e42fb162203335ae61aa2c" ], "markers": 
"python_version >= '3'", - "version": "==2.0.8" + "version": "==2.0.9" }, "colorama": { "hashes": [ @@ -538,7 +526,7 @@ "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7", "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951" ], - "markers": "python_version < '4' and python_full_version >= '3.6.1'", + "markers": "python_version < '4.0' and python_full_version >= '3.6.1'", "version": "==5.10.1" }, "jeepney": { @@ -685,10 +673,10 @@ }, "pkginfo": { "hashes": [ - "sha256:65175ffa2c807220673a41c371573ac9a1ea1b19ffd5eef916278f428319934f", - "sha256:bb55a6c017d50f2faea5153abc7b05a750e7ea7ae2cbb7fb3ad6f1dcf8d40988" + "sha256:542e0d0b6750e2e21c20179803e40ab50598d8066d51097a0e382cba9eb02bff", + "sha256:c24c487c6a7f72c66e816ab1796b96ac6c3d14d49338293d2141664330b55ffc" ], - "version": "==1.8.1" + "version": "==1.8.2" }, "pluggy": { "hashes": [ @@ -912,7 +900,7 @@ "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece", "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.7" }, "webencodings": { diff --git a/ccc/collocates.py b/ccc/collocates.py index 2baf024..2c053c2 100644 --- a/ccc/collocates.py +++ b/ccc/collocates.py @@ -8,7 +8,7 @@ from pandas import DataFrame # part of module -from .counts import score_counts_signature +from .counts import score_counts from .utils import node2cotext logger = logging.getLogger(__name__) @@ -72,15 +72,12 @@ def count(self, window): logger.info('slicing window %d' % window) relevant = self.df_cooc.loc[abs(self.df_cooc['offset']) <= window] - # number of possible occurrence positions within window - f1 = len(relevant) - # frequency counts f = self.corpus.counts.cpos(relevant['cpos'], self.p_query) - return f, f1 + return f - def show(self, window=5, order='O11', cut_off=100, ams=None, + def show(self, window=5, order='log_likelihood', cut_off=100, ams=None, min_freq=2, frequencies=True, flags=None, marginals='corpus'): @@ -89,8 +86,10 @@ def show(self, window=5, order='O11', cut_off=100, ams=None, logger.error("nothing to show") return DataFrame() - # get subcorpus frequencies - f, f1 = self.count(window) + # get window counts and apply min freq + f = self.count(window).rename(columns={'freq': 'f'}) + f1 = f['f'].sum() + f = f.loc[f['f'] >= min_freq] # get reference frequencies if isinstance(marginals, str): @@ -112,11 +111,13 @@ def show(self, window=5, order='O11', cut_off=100, ams=None, f2 = f2.fillna(0, downcast='infer') f2['f2'] = f2['marginal'] - f2['in_nodes'] + # create dataframe + df = f2.join(f) + df['f1'] = f1 + df['N'] = N + # score - collocates = score_counts_signature( - f[['freq']], f1, f2[['f2']], N, - min_freq, order, cut_off, flags, ams, frequencies - ) + collocates = score_counts(df, order, cut_off, flags, ams) if frequencies: # throw away anti-collocates by default diff --git a/ccc/counts.py b/ccc/counts.py index 93b869f..fc267f3 100644 --- a/ccc/counts.py +++ b/ccc/counts.py @@ -480,24 +480,16 @@ def mwus(self, cqp, queries, p_atts=None, fill_missing=True, strategy=1): return df -def score_counts(df1, df2, R1=None, R2=None, reference='right', - min_freq=2, order='log_likelihood', cut_off=1000, - flags=None, ams=None, freq=True, digits=6): - """calculate association measures for two frequency lists df1, df2 - 
with respective sizes R1, R2.
-
-    :param DataFrame df1: counts per item in corpus 1
-    :param DataFrame df2: counts per item in corpus 2
-    :param int R1: number of items in df1
-    :param int R2: number of items in df2
-
-    :param str reference: which dataframe is the reference?
-    :param int min_freq: minimum number of occurrences in df1
+def score_counts(df, order='log_likelihood', cut_off=1000,
+                 flags=None, ams=None, digits=6):
+    """score counts in DataFrame.
+
+    :param DataFrame df: counts with index 'item'; columns in frequency-signature (f, f1, f2, N) or corpus-frequency (f1, N1, f2, N2) notation
+
+    :param str order: association measure for sorting (in descending order)
     :param int cut_off: number of items to retrieve
     :param str flags: '%c' / '%d' / '%cd' (cwb-ccc algorithm)
     :param list ams: association measures to calculate (None=all)
-    :param bool freq: include absolute and relative frequencies?
     :param int digits: round dataframe

     :return: scored counts
@@ -505,80 +497,18 @@
     """

-    # which one should be treated as reference?
-    if reference == 'left':
-        return score_counts(df2, df1, R2, R1, reference='left',
-                            order=order, cut_off=cut_off,
-                            flags=flags, ams=ams, freq=freq,
-                            digits=digits)
-
     logger.info('creating table of association measures')

-    # preprocess
-    df1.columns = ['O11']
-    df2.columns = ['O21']
-
-    # get corpus sizes if necessary
-    R1 = df1['O11'].sum() if R1 is None else R1
-    R2 = df2['O21'].sum() if R2 is None else R2
-
-    # join dataframes respecting min_freq
-    if min_freq == 0:
-        df = df1.join(df2, how='outer')
-    else:
-        df1 = df1.loc[df1['O11'] >= min_freq]
-        df = df1.join(df2, how='left')
-    df = df.fillna(0, downcast='infer')
-
     # post-processing: fold items
     df = fold_df(df, flags)

     # calculate association
-    df["O12"] = R1 - df["O11"]
-    df["O22"] = R2 - df["O21"]
-    df = measures.calculate_measures(df, freq=freq)
-
-    if freq:
-        # add instances per million
-        df['ipm'] = df['O11'] / R1 * 1000000
-        df['ipm_expected'] = df['E11'] / R1 * 1000000
-        df['ipm_reference'] = df['O21'] / R2 * 1000000
-        df['ipm_reference_expected'] = df['E21'] / R2 * 1000000
+    df = measures.score(df, measures=ams, freq=True, digits=digits)

     # sort
-    df = df.sort_values(by=[order, 'O11', 'O12'], ascending=False)
+    df = df.sort_values(by=[order, 'item'], ascending=False)

     # apply cut-off
     df = df.head(cut_off) if cut_off is not None else df

-    # round
-    df = round(df, digits) if digits is not None else df
-
     return df
-
-
-def score_counts_signature(f, f1, f2, N, min_freq=2,
-                           order='log_likelihood', cut_off=1000,
-                           flags=None, ams=None, freq=True, digits=6):
-    """wrapper of score_counts for input in frequency signature notation.
-
-    :param DataFrame f: co-occurrence freq. of token and node
-    :param int f1: number of tokens in W(node)
-    :param DataFrame f2: marginal freq.
of tokens
-    :param int N: size of corpus
-
-    :return: scored counts
-    :rtype: ScoreFrame
-
-    """
-
-    f.columns = ['O11']
-    f2.columns = ['C1']
-    df = f.join(f2, how='outer').fillna(0, downcast='infer')
-    df['O21'] = df['C1'] - df['O11']
-
-    return score_counts(
-        f, df[['O21']], f1, N-f1, reference='right', min_freq=min_freq,
-        order=order, cut_off=cut_off, flags=flags, ams=ams,
-        freq=freq, digits=digits
-    )
diff --git a/ccc/keywords.py b/ccc/keywords.py
index 318f387..d991b77 100644
--- a/ccc/keywords.py
+++ b/ccc/keywords.py
@@ -8,7 +8,7 @@
 from pandas import DataFrame

 # part of module
-from .counts import score_counts_signature
+from .counts import score_counts

 logger = logging.getLogger(__name__)

@@ -53,9 +53,10 @@ def show(self, order='f', cut_off=100, ams=None,
             logger.warning("nothing to show")
             return DataFrame()

-        # get subcorpus frequencies
-        f = self.counts.loc[self.counts['freq'] >= min_freq]
-        f1 = self.counts['freq'].sum()
+        # get subcorpus frequencies and apply min freq
+        f = self.counts.rename(columns={'freq': 'f'})
+        f1 = f['f'].sum()
+        f = f.loc[f['f'] >= min_freq]

         # get reference frequency
         if isinstance(marginals, str):
@@ -70,10 +71,13 @@
         else:
             raise NotImplementedError

+        # create dataframe
+        f2 = marginals[['freq']].rename(columns={'freq': 'f2'})
+        df = f2.join(f)
+        df['f1'] = f1
+        df['N'] = N
+
         # score
-        keywords = score_counts_signature(
-            f[['freq']], f1, marginals[['freq']], N,
-            min_freq, order, cut_off, flags, ams, frequencies
-        )
+        keywords = score_counts(df, order, cut_off, flags, ams)

         return keywords
diff --git a/setup.py b/setup.py
index 1ea27f5..7cf4bb8 100644
--- a/setup.py
+++ b/setup.py
@@ -68,7 +68,7 @@ def guess_cl_directory():
     url="https://github.com/ausgerechnet/cwb-ccc",
     ext_modules=extensions,
     install_requires=[
-        "association-measures>=0.1.6",
+        "association-measures>=0.2.0",
         "pandas>=1.2.0",
         "numexpr>=2.7.1",
         "Bottleneck>=1.3.2",
diff --git a/tests/test_07_counts.py b/tests/test_07_counts.py
index 0da95aa..5a31b61 100644
--- a/tests/test_07_counts.py
+++ b/tests/test_07_counts.py
@@ -1,5 +1,5 @@
 from ccc.cwb import Corpus
-from ccc.counts import cwb_scan_corpus, read_freq_list, cwb_lexdecode
+from ccc.counts import cwb_scan_corpus, read_freq_list, cwb_lexdecode, score_counts
 from ccc.utils import format_cqp_query
 import pandas as pd
 import pytest
@@ -431,3 +431,18 @@ def test_cwb_counts(germaparl):
     assert(df['freq'][queries[1]] == 55)

     cqp.__kill__()
+
+
+def test_score_counts(germaparl, empirist):
+
+    df1, R1 = read_freq_list(germaparl['freq_list'])
+    df2, R2 = read_freq_list(empirist['freq_list'])
+    df = df1[['freq']].rename(columns={'freq': 'f1'}).join(
+        df2[['freq']].rename(columns={'freq': 'f2'})
+    )
+    df['N1'] = R1
+    df['N2'] = R2
+    df = df.fillna(0, downcast='infer')
+
+    kw = score_counts(df, cut_off=None)
+    assert kw['log_likelihood']['die'] == 4087.276827
diff --git a/tests/test_09_keywords.py b/tests/test_09_keywords.py
index fbfeb00..cd912e7 100644
--- a/tests/test_09_keywords.py
+++ b/tests/test_09_keywords.py
@@ -1,6 +1,6 @@
 from ccc import Corpus
 from ccc.keywords import Keywords
-from ccc.counts import read_freq_list, score_counts
+from ccc.counts import score_counts
 import pytest

 from .conftest import DATA_PATH
@@ -89,23 +89,23 @@ def test_keywords_combo(germaparl):
     assert lines.index[0] == "und KON"


-def test_score_counts(germaparl, empirist):
-
-    df1, R1 = read_freq_list(germaparl['freq_list'])
-    df2, R2 = read_freq_list(empirist['freq_list'])
-
-    kw = score_counts(df1[['freq']],
df2[['freq']], R1, R2, cut_off=None) - assert kw['log_likelihood']['die'] == 4087.276827 - - @pytest.mark.now def test_keywords(germaparl): corpus = get_corpus(germaparl) - left = corpus.marginals(p_atts=['lemma', 'pos'])[['freq']] - right = corpus.marginals(p_atts=['lemma', 'pos'])[['freq']] - kw = score_counts(left, right) + left = corpus.marginals(p_atts=['lemma', 'pos'])[['freq']].rename( + columns={'freq': 'f1'} + ) + right = corpus.marginals(p_atts=['lemma', 'pos'])[['freq']].rename( + columns={'freq': 'f2'} + ) + + df = left.join(right).fillna(0, downcast='infer') + df['N1'] = df['f1'].sum() + df['N2'] = df['f2'].sum() + kw = score_counts(df) + kw = kw.sort_values(by='O11', ascending=False) - assert kw.iloc[0]['O11'] == 11469 + assert kw.iloc[0]['O11'] == 1095 assert kw.iloc[0]['conservative_log_ratio'] == 0 diff --git a/tests/test_10_dumps.py b/tests/test_10_dumps.py index 63621f2..8a64d60 100644 --- a/tests/test_10_dumps.py +++ b/tests/test_10_dumps.py @@ -207,7 +207,8 @@ def test_dumps_collocates(germaparl): window=20 ) assert len(tables) == len(parties) - assert tables['yellow'].index[0] == 'Freiheit' + print(tables['yellow']) + assert tables['yellow'].index[0] == 'Grad' def test_dumps_collocates_global(germaparl): From d9ba9e8cbe22a81f604e64805f942e0f109842a5 Mon Sep 17 00:00:00 2001 From: Philipp Heinrich Date: Tue, 7 Dec 2021 18:28:38 +0100 Subject: [PATCH 02/13] update Pipfile.lock --- Pipfile.lock | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 9a94701..c0b8ad7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e68175e004b516d943e76c8ecb6a1c9f5d7ebeb64480355a8af6810def854125" + "sha256": "0a9c872c3d252725d05b8b890b2498642da7a3bedb81052e995316a1721731a1" }, "pipfile-spec": 6, "requires": {}, @@ -16,9 +16,10 @@ "default": { "association-measures": { "hashes": [ + "sha256:15300c3481f949a934ec4e8c89a821114f5a8f894278dc8eff7dd0628690da1f", "sha256:4dd35cbc4b572d82b76aeb3bbab3d6068bf378a0541cb4c4cf2688a508936a89" ], - "path": "./../pandas-association-measures/dist/association-measures-0.2.0.tar.gz", + "index": "pypi", "version": "==0.2.0" }, "bottleneck": { @@ -526,7 +527,7 @@ "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7", "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951" ], - "markers": "python_version < '4.0' and python_full_version >= '3.6.1'", + "markers": "python_version < '4' and python_full_version >= '3.6.1'", "version": "==5.10.1" }, "jeepney": { @@ -889,18 +890,18 @@ }, "twine": { "hashes": [ - "sha256:5a3e3fb52b926827c99e050f0c1e5d8ae599848f3eb27764f19b886c09134590", - "sha256:8d6a0ad895576c97e9ad4a5da2d6adea37fd5434ecabace0054013d537ddbc6c" + "sha256:28460a3db6b4532bde6a5db6755cf2dce6c5020bada8a641bb2c5c7a9b1f35b8", + "sha256:8c120845fc05270f9ee3e9d7ebbed29ea840e41f48cd059e04733f7e1d401345" ], "index": "pypi", - "version": "==3.7.0" + "version": "==3.7.1" }, "urllib3": { "hashes": [ "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece", "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.7" }, "webencodings": { From 3deca4b30f1ae10a7bc726a9494e904fa79dd886 Mon Sep 17 00:00:00 2001 From: Philipp Heinrich Date: 
Tue, 7 Dec 2021 19:01:55 +0100
Subject: [PATCH 03/13] use of p_text and p_slots in cqpy.run_query

---
 ccc/cqpy.py                                   | 14 +++++++++++++-
 tests/conftest.py                             |  3 +++
 .../library/queries/jemand_sagt_display.cqpy  | 17 +++++++++++++++++
 tests/test_06_cqpy.py                         | 13 +++++++++++++
 tests/test_09_keywords.py                     |  1 -
 5 files changed, 46 insertions(+), 2 deletions(-)
 create mode 100644 tests/corpora/library/queries/jemand_sagt_display.cqpy

diff --git a/ccc/cqpy.py b/ccc/cqpy.py
index 87287af..f4332dc 100644
--- a/ccc/cqpy.py
+++ b/ccc/cqpy.py
@@ -243,7 +243,7 @@ def run_query(corpus, query,
     # backwards compatability
     for p in ['p_slots', 'p_text']:
         if p in query['display']:
-            logger.warning("use of '%s' is deprecated" % p)
+            # logger.warning("use of '%s' is deprecated" % p)
             if query['display'][p] not in p_show:
                 p_show += [query['display'][p]]

@@ -265,4 +265,16 @@
         form=form
     )

+    # post-process: only return relevant columns
+    if 'display' in query:
+        if 'p_text' in query['display']:
+            drop = [p for p in p_show if p != query['display']['p_text']]
+            lines = lines.drop(drop, axis=1)
+        if 'p_slots' in query['display']:
+            drop = list()
+            for slot in slots.keys():
+                drop += ["_".join([slot, p]) for p in p_show
+                         if p != query['display']['p_slots']]
+            lines = lines.drop(drop, axis=1)
+
     return lines
diff --git a/tests/conftest.py b/tests/conftest.py
index 758a707..6ee0b42 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -113,6 +113,9 @@ def query_files():
         ),
         'jemand_sagt_sloppy': os.path.join(
            DIR_PATH, "corpora", "library", "queries", "jemand_sagt_sloppy.cqpy"
+        ),
+        'jemand_sagt_display': os.path.join(
+            DIR_PATH, "corpora", "library", "queries", "jemand_sagt_display.cqpy"
         )
     }

diff --git a/tests/corpora/library/queries/jemand_sagt_display.cqpy b/tests/corpora/library/queries/jemand_sagt_display.cqpy
new file mode 100644
index 0000000..ceff2d6
--- /dev/null
+++ b/tests/corpora/library/queries/jemand_sagt_display.cqpy
@@ -0,0 +1,17 @@
+# --- # CQPY query file
+# anchors:
+#   corrections:
+#     1: -1
+# slots:
+#   entity: [match, 1]
+#   vp: [2, 3]
+#   proposition: [4, contextend]
+# query:
+#   context: null
+#   context_break: s
+# display:
+#   p_slots: lemma
+#   p_text: word
+# ---
+
+/np[] @1:[::] @2:[::] [lemma != "sagen"]* @3:[lemma="sagen"] (("," "dass")?
| ":") @4:[::] \ No newline at end of file diff --git a/tests/test_06_cqpy.py b/tests/test_06_cqpy.py index 50128b6..5f8bf62 100644 --- a/tests/test_06_cqpy.py +++ b/tests/test_06_cqpy.py @@ -45,3 +45,16 @@ def test_run_from_cqpy_sloppy(germaparl, query_files): assert lines[['five_word']].value_counts()[''] == 440 assert lines[['five_word']].value_counts()['nichts'] == 3 assert lines[['entity_lemma']].value_counts()['sie'] == 273 + + +def test_run_from_cqpy_display(germaparl, query_files): + + corpus = get_corpus(germaparl) + query = cqpy_load(query_files['jemand_sagt_display']) + lines = run_query(corpus, query) + + assert 'word' in lines.columns + assert 'lemma' not in lines.columns + assert 'entity_word' not in lines.columns + assert 'entity_lemma' in lines.columns + assert lines[['entity_lemma']].value_counts()['sie'] == 273 diff --git a/tests/test_09_keywords.py b/tests/test_09_keywords.py index cd912e7..1f2dec8 100644 --- a/tests/test_09_keywords.py +++ b/tests/test_09_keywords.py @@ -89,7 +89,6 @@ def test_keywords_combo(germaparl): assert lines.index[0] == "und KON" -@pytest.mark.now def test_keywords(germaparl): corpus = get_corpus(germaparl) From 7cfdff255546c39de55ef488573424574fd765a8 Mon Sep 17 00:00:00 2001 From: Philipp Heinrich Date: Wed, 23 Feb 2022 22:02:19 +0100 Subject: [PATCH 04/13] test cqp.Ok() --- ccc/cqp.py | 18 ++++++-------- tests/test_02_cl_cqp.py | 55 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/ccc/cqp.py b/ccc/cqp.py index eec0b69..b172970 100644 --- a/ccc/cqp.py +++ b/ccc/cqp.py @@ -107,7 +107,8 @@ def __init__(self, binary="/usr/local/bin/cqp", options='-c', print_version=Fals version_string = version_string.rstrip() # Equivalent to Perl's chomp self.CQP_process.stdout.flush() if print_version: - logger.info(version_string) + print(version_string) + logger.debug("CQP " + "-" * 43 + " started") version_regexp = re.compile( r'^CQP\s+(?:\w+\s+)*([0-9]+)\.([0-9]+)(?:\.b?([0-9]+))?(?:\s+(.*))?$' ) @@ -157,11 +158,9 @@ def __del__(self): # print "Deleting CQP with pid", self.CQP_process.pid, "...", self.CQPrunning = False self.execStart = time.time() - if self.debug: - logger.info("Shutting down CQP backend ...") + logger.debug("Shutting down CQP backend ...") self.CQP_process.stdin.write('exit;') # exits CQP backend - if self.debug: - logger.info("Done\nCQP object deleted.") + logger.debug("... 
-- CQP object deleted.") self.execStart = None # print "Finished" @@ -181,8 +180,7 @@ def Exec(self, cmd): self.status = 'ok' cmd = cmd.rstrip() # Equivalent to Perl's 'chomp' cmd = re.sub(r';\s*$', r'', cmd) - if self.debug: - logger.info("CQP <<", cmd + ";") + logger.debug("CQP << " + cmd + ";") try: self.CQP_process.stdin.write(cmd + '; .EOL.;\n') except IOError: @@ -201,11 +199,9 @@ def Exec(self, cmd): ln = self.CQP_process.stdout.readline() ln = ln.strip() # strip off whitespace from start and end of line if re.match(r'-::-EOL-::-', ln): - if self.debug: - print("CQP " + "-" * 60) + logger.debug("CQP " + "-" * 40 + " terminated") break - if self.debug: - print("CQP >> " + ln) + logger.debug("CQP >> " + ln) if ln != '': result.append(ln) self.CQP_process.stdout.flush() diff --git a/tests/test_02_cl_cqp.py b/tests/test_02_cl_cqp.py index 802c4b0..1b63e6a 100644 --- a/tests/test_02_cl_cqp.py +++ b/tests/test_02_cl_cqp.py @@ -3,6 +3,7 @@ from pandas import DataFrame from time import sleep +# import pytest def test_cqp_version(): @@ -123,3 +124,57 @@ def test_cl(germaparl): # first word of 1235th sentence assert(words[s_1234[0]] == "Die") + + +def test_nqr_from_dump_error(germaparl): + cqp = CQP( + binary="cqp", + options='-c -r ' + germaparl['registry_path'] + ) + cqp.Exec(germaparl['corpus_name']) + + # valid dump: + df_dump = DataFrame( + data={ + 'match': [0, 2], + 'matchend': [3, 4] + } + ).set_index(['match', 'matchend']) + cqp.nqr_from_dump(df_dump, name='Valid') + assert cqp.Ok() + assert int(cqp.Exec('size Valid;')) == 2 + + # error type 1: missing values + df_dump = DataFrame( + data={ + 'match': [0, 0], + 'matchend': [10, -1] + } + ).set_index(['match', 'matchend']) + cqp.nqr_from_dump(df_dump, name='Error1') + assert not cqp.Ok() + assert int(cqp.Exec('size Error1;')) == 0 + + # error type 2: match after matchend + df_dump = DataFrame( + data={ + 'match': [0, 10], + 'matchend': [10, 9] + } + ).set_index(['match', 'matchend']) + cqp.nqr_from_dump(df_dump, name='Error2') + assert not cqp.Ok() + assert int(cqp.Exec('size Error2;')) == 0 + + # valid dump: + df_dump = DataFrame( + data={ + 'match': [0, 2], + 'matchend': [3, 4] + } + ).set_index(['match', 'matchend']) + cqp.nqr_from_dump(df_dump, name='Valid') + assert cqp.Ok() + assert int(cqp.Exec('size Valid;')) == 2 + + cqp.__kill__() From d9eec9200a1ade6f141bfe4751f9083cc9e8060c Mon Sep 17 00:00:00 2001 From: Philipp Heinrich Date: Thu, 24 Feb 2022 01:25:29 +0100 Subject: [PATCH 05/13] text constellation concordance --- ccc/discoursemes.py | 98 ++++++++++++++++++------ ccc/version.py | 2 +- tests/test_11_discoursemes.py | 138 +++++++++++++++++++++------------- 3 files changed, 160 insertions(+), 78 deletions(-) diff --git a/ccc/discoursemes.py b/ccc/discoursemes.py index a10968d..7a92948 100644 --- a/ccc/discoursemes.py +++ b/ccc/discoursemes.py @@ -152,22 +152,23 @@ def constellation_left_join(df1, df2, name, drop=True): return m -def constellation_outer_join(df1, df2, name): - """join an additional dump df2 to an existing constellation +def aggregate_matches(df, name, context_col='contextid', + match_cols=['match', 'matchend']): - :param DataFrame df1: textual const. === (ci) nr_1 nr_2 ... == - :param DataFrame df2: additional dump === (m, me) ci c ce == - :param str name: name for additional discourseme - :return: constellation dump incl. additional dump === (ci) nr_1 nr_2 ... 
nr_name == - :rtype: DataFrame - """ + # counts + counts = DataFrame(df[context_col].value_counts()).astype("Int64") + counts.columns = ['COUNTS_' + name] - # merge dumps via contextid ### - table = DataFrame(df2[['contextid']].value_counts()) - table.columns = [name] - m = df1.join(table, how='outer').astype("Int64") + # matches + matches = df.reset_index() + matches['MATCHES_' + name] = matches[match_cols].values.tolist() + matches['MATCHES_' + name] = matches['MATCHES_' + name].apply(tuple) + matches = matches.groupby('contextid')['MATCHES_' + name].apply(set) - return m + # combine + table = counts.join(matches) + + return table def role_formatter(row, names, s_show, window): @@ -199,9 +200,17 @@ def role_formatter(row, names, s_show, window): # discourseme names for name in names: role = [None] * len(d['offset']) - for t in row[name]: - for i in range(d['cpos'].index(t[1]), d['cpos'].index(t[2]) + 1): - role[i] = name + if not isinstance(row[name], float): + for t in row[name]: + if len(t) == 2: + # lazy definition without offset + start = 0 + end = 1 + else: + start = 1 + end = 2 + for i in range(d['cpos'].index(t[start]), d['cpos'].index(t[end]) + 1): + role[i] = name roles.append(role) # combine individual roles into one list of lists @@ -297,7 +306,7 @@ def concordance(self, window=5, # convert dataframe df_grouped = self.group_lines() - # retrieve concordance lines + # retrieve concordance lines // TODO speed up: first cut-off, then retrieval conc = Concordance(self.corpus.copy(), df_grouped) lines = conc.lines(form='dict', p_show=p_show, s_show=s_show, order=order, cut_off=cut_off) @@ -362,12 +371,11 @@ def __init__(self, dump, s_context, name='topic'): """ self.corpus = dump.corpus - table = DataFrame(dump.df[['contextid']].value_counts()) - table.columns = [name] - self.df = table self.s_context = s_context self.N = len(self.corpus.attributes.attribute(s_context, 's')) + self.df = aggregate_matches(dump.df, name) + def add_discourseme(self, dump, name='discourseme'): # register discourseme @@ -375,15 +383,59 @@ def add_discourseme(self, dump, name='discourseme'): logger.error('name "%s" already taken; cannot register discourseme' % name) return - self.df = constellation_outer_join(self.df, dump.df, name) + df = aggregate_matches(dump.df, name) + df = self.df.join(df, how='outer') + + self.df = df + + def concordance(self, p_show=['word', 'lemma'], s_show=[], + order='random', cut_off=100): + + df = self.df.sample(cut_off) + + # join context..contextend + contexts = self.corpus.dump_from_s_att(self.s_context, annotation=False) + contexts.columns = ['contextid'] + contexts = contexts.reset_index().set_index('contextid') + df = df.join(contexts).set_index(['match', 'matchend']) + + # retrieve concordance lines + conc = Concordance(self.corpus.copy(), df) + lines = conc.lines(form='dict', p_show=p_show, s_show=s_show, + order=order, cut_off=cut_off) + + # get boolean columns for each discourseme + names_bool = list() + for name in [c for c in df.columns if c.startswith("COUNTS_")]: + name_bool = '_'.join(['BOOL', name.split("COUNTS_")[-1]]) + names_bool.append(name_bool) + lines[name_bool] = (lines[name] > 0) + lines[name_bool] = lines[name_bool].fillna(False) + + # format roles + match_cols = [c for c in lines.columns if c.startswith("MATCHES_")] + match_names = [c.split("MATCHES_")[-1] for c in match_cols] + col_mapper = dict(zip(match_cols, match_names)) + lines = lines.rename(columns=col_mapper) + lines = list(lines.apply( + lambda row: role_formatter( + row, match_names, 
s_show=names_bool+s_show, window=0 + ), axis=1 + )) + + return lines def associations(self, ams=None, frequencies=True, min_freq=2, order='log_likelihood', cut_off=None): + counts = self.df[[c for c in self.df.columns if c.startswith("COUNTS_")]] + counts.columns = [c.split("COUNTS_")[-1] for c in counts.columns] + cooc = counts > 0 + + # TODO obere Dreiecksmatrix tables = DataFrame() - cooc = self.df > 0 - for name in self.df.columns: + for name in counts.columns: table = round(textual_associations( cooc, self.N, name ).reset_index(), 2) diff --git a/ccc/version.py b/ccc/version.py index 1f4c4d4..0fb0597 100644 --- a/ccc/version.py +++ b/ccc/version.py @@ -1 +1 @@ -__version__ = "0.10.1" +__version__ = "0.10.2.dev2" diff --git a/tests/test_11_discoursemes.py b/tests/test_11_discoursemes.py index 3ebc9a4..63235dd 100644 --- a/tests/test_11_discoursemes.py +++ b/tests/test_11_discoursemes.py @@ -264,6 +264,35 @@ def test_create_constellation(germaparl, discoursemes): assert len(df) == 2990 +def test_create_textconstellation(germaparl, discoursemes): + + corpus_name = germaparl['corpus_name'] + + # parameters + parameters = discoursemes.pop('parameters') + flags = parameters['flags_query'] + escape = parameters['escape_query'] + p_query = parameters['p_query'] + s_query = parameters['s_query'] + s_context = parameters['s_context'] + context = parameters['context'] + + # get topic and additional discoursemes + names = list(discoursemes.keys()) + topic_name = names[0] + topic_items = discoursemes.pop(topic_name) + additional_discoursemes = discoursemes + + const = create_constellation(corpus_name, topic_name, topic_items, + p_query, s_query, flags, escape, + s_context, context, + additional_discoursemes, + registry_path=germaparl['registry_path'], + data_path=DATA_PATH, text=True) + + assert len(const.df) == 2198 + + @pytest.mark.mmda def test_mmda(germaparl): @@ -466,7 +495,7 @@ def test_textual_constellation(germaparl, discoursemes): s_context=discoursemes['parameters']['s_context'] ) assert len(const.df) == 624 - assert 'topic' in const.df.columns + assert 'MATCHES_topic' in const.df.columns def test_textual_constellation_add(germaparl, discoursemes): @@ -509,66 +538,67 @@ def test_textual_constellation_add(germaparl, discoursemes): ) assert len(const.df) == 2156 - assert 'discourseme' in const.df.columns + assert 'MATCHES_discourseme' in const.df.columns def test_textual_constellation_association(germaparl, discoursemes): - corpus = get_corpus(germaparl) + corpus_name = germaparl['corpus_name'] - # init constellation - topic_query = format_cqp_query( - discoursemes['items_topic'], - p_query=discoursemes['parameters']['p_query'], - s_query=discoursemes['parameters']['s_query'], - flags=discoursemes['parameters']['flags_query'], - escape=discoursemes['parameters']['escape_query'] - ) - topic_dump = corpus.query( - topic_query, - context=None, - context_break=discoursemes['parameters']['s_context'] - ) - const = TextConstellation( - topic_dump, - s_context=discoursemes['parameters']['s_context'] - ) + # parameters + parameters = discoursemes.pop('parameters') + flags = parameters['flags_query'] + escape = parameters['escape_query'] + p_query = parameters['p_query'] + s_query = parameters['s_query'] + s_context = parameters['s_context'] + context = parameters['context'] - # add discourseme - disc1_query = format_cqp_query( - discoursemes['items_1'], - p_query=discoursemes['parameters']['p_query'], - s_query=discoursemes['parameters']['s_query'], - 
flags=discoursemes['parameters']['flags_query'], - escape=discoursemes['parameters']['escape_query'] - ) - disc1_dump = corpus.query( - disc1_query, - context=None, - context_break=discoursemes['parameters']['s_context'] - ) - const.add_discourseme( - disc1_dump, - name='disc1' - ) - # add discourseme 2 - disc2_query = format_cqp_query( - discoursemes['items_2'], - p_query=discoursemes['parameters']['p_query'], - s_query=discoursemes['parameters']['s_query'], - flags=discoursemes['parameters']['flags_query'], - escape=discoursemes['parameters']['escape_query'] - ) - disc2_dump = corpus.query( - disc2_query, - context=None, - context_break=discoursemes['parameters']['s_context'] - ) - const.add_discourseme( - disc2_dump, - name='disc2' - ) + # get topic and additional discoursemes + names = list(discoursemes.keys()) + topic_name = names[0] + topic_items = discoursemes.pop(topic_name) + additional_discoursemes = discoursemes + + const = create_constellation(corpus_name, topic_name, topic_items, + p_query, s_query, flags, escape, + s_context, context, + additional_discoursemes, + registry_path=germaparl['registry_path'], + data_path=DATA_PATH, text=True) assoc = const.associations() assert len(assoc) == 6 assert 'candidate' in assoc.columns + + +@pytest.mark.now +def test_textual_constellation_concordance(germaparl, discoursemes): + + corpus_name = germaparl['corpus_name'] + + # parameters + parameters = discoursemes.pop('parameters') + flags = parameters['flags_query'] + escape = parameters['escape_query'] + p_query = parameters['p_query'] + s_query = parameters['s_query'] + s_context = parameters['s_context'] + context = parameters['context'] + + # get topic and additional discoursemes + names = list(discoursemes.keys()) + topic_name = names[0] + topic_items = discoursemes.pop(topic_name) + additional_discoursemes = discoursemes + + const = create_constellation(corpus_name, topic_name, topic_items, + p_query, s_query, flags, escape, + s_context, context, + additional_discoursemes, + registry_path=germaparl['registry_path'], + data_path=DATA_PATH, text=True) + + from pprint import pprint + lines = const.concordance() + pprint(lines[0]) From d2a6891800a420df90ec4c24fdb88571e92ee2bf Mon Sep 17 00:00:00 2001 From: Philipp Heinrich Date: Fri, 25 Feb 2022 00:16:33 +0100 Subject: [PATCH 06/13] constellation concordances --- ccc/discoursemes.py | 225 ++++++++++++++++------ tests/conftest.py | 8 +- tests/test_11_discoursemes.py | 338 +++++++++++++++++----------------- 3 files changed, 349 insertions(+), 222 deletions(-) diff --git a/ccc/discoursemes.py b/ccc/discoursemes.py index 7a92948..62ae5aa 100644 --- a/ccc/discoursemes.py +++ b/ccc/discoursemes.py @@ -104,7 +104,7 @@ ########################################################## -def constellation_left_join(df1, df2, name, drop=True): +def constellation_left_join(df1, df2, name, drop=True, window=None): """join an additional dump df2 to an existing constellation :param DataFrame df1: constellation dump === (m, me) ci c ce m_t* me_t* o_t* m_1* me_1* o_1* ... 
== @@ -132,10 +132,13 @@ def constellation_left_join(df1, df2, name, drop=True): # restrict to complete constellation ### if drop: m = m.dropna() - # only keep co-occurrences that are within context - m = m.loc[ - (m['matchend_y'] >= m['context']) & (m['match_y'] < m['contextend']) - ] + if window is None: + # only keep co-occurrences that are within context + m = m.loc[ + (m['matchend_y'] >= m['context']) & (m['match_y'] < m['contextend']) + ] + else: + m = m.loc[abs(m['offset_y']) <= window] # rename columns ### m = m.rename(columns={ @@ -187,6 +190,7 @@ def role_formatter(row, names, s_show, window): """ + # TODO directly create relevant objects, no need for frontend to take care of it # init d = row['dict'] roles = list() @@ -199,24 +203,44 @@ def role_formatter(row, names, s_show, window): # discourseme names for name in names: + role = [None] * len(d['offset']) + if not isinstance(row[name], float): for t in row[name]: + + # check match information if len(t) == 2: # lazy definition without offset start = 0 end = 1 - else: + elif len(t) == 3: + # with offset start = 1 end = 2 - for i in range(d['cpos'].index(t[start]), d['cpos'].index(t[end]) + 1): + else: + continue + + # skip NAs + if not isinstance(t[start], int): + continue + + # skip the ones too far away + try: + start = d['cpos'].index(t[start]) + end = d['cpos'].index(t[end]) + 1 + except ValueError: + continue + + for i in range(start, end): role[i] = name + roles.append(role) # combine individual roles into one list of lists d['role'] = [[a for a in set(r) if a is not None] for r in list(zip(*roles))] - # append s-attributes + # add s-attributes for s in s_show: d[s] = row[s] @@ -253,7 +277,7 @@ def __repr__(self): """ return self.__str__() - def add_discourseme(self, dump, name='discourseme', drop=True): + def add_discourseme(self, dump, name='discourseme', drop=True, window=None): """ :param Dump dump: dump.df: == (m, me) ci == :param str name: name of the discourseme @@ -266,7 +290,8 @@ def add_discourseme(self, dump, name='discourseme', drop=True): return self.discoursemes[name] = dump - self.df = constellation_left_join(self.df, dump.df, name, drop=drop) + self.df = constellation_left_join(self.df, dump.df, name, + drop=drop, window=window) def group_lines(self): """ @@ -291,11 +316,12 @@ def group_lines(self): df[name] = df[name].apply(tuple) df = df.drop(columns, axis=1) df_reduced[name] = df.groupby(level=['match', 'matchend'])[name].apply(set) + return df_reduced def concordance(self, window=5, p_show=['word', 'lemma'], s_show=[], - order='random', cut_off=100): + order='random', cut_off=100, random_seed=42): """Retrieve concordance lines for constellation. 
:param int window: cpos further away from node will be marked 'out_of_window' @@ -305,11 +331,24 @@ def concordance(self, window=5, """ # convert dataframe - df_grouped = self.group_lines() - # retrieve concordance lines // TODO speed up: first cut-off, then retrieval - conc = Concordance(self.corpus.copy(), df_grouped) + df = self.group_lines() + + # cut off and sampling + cut_off = len(df) if cut_off is None or cut_off > len(df) else cut_off + if order == 'random': + df = df.sample(cut_off, random_state=random_seed) + elif order == 'first': + df = df.head(cut_off) + elif order == 'last': + df = df.head(cut_off) + else: + raise NotImplementedError + + # retrieve concordance lines + conc = Concordance(self.corpus.copy(), df) lines = conc.lines(form='dict', p_show=p_show, s_show=s_show, order=order, cut_off=cut_off) + # map roles output = list(lines.apply( lambda row: role_formatter( @@ -386,12 +425,22 @@ def add_discourseme(self, dump, name='discourseme'): df = aggregate_matches(dump.df, name) df = self.df.join(df, how='outer') - self.df = df - - def concordance(self, p_show=['word', 'lemma'], s_show=[], - order='random', cut_off=100): + self.df = df.sort_index() - df = self.df.sample(cut_off) + def concordance(self, window=0, + p_show=['word', 'lemma'], s_show=[], + order='random', cut_off=100, random_seed=42): + + # cut off and sampling + cut_off = len(self.df) if cut_off is None or cut_off > len(self.df) else cut_off + if order == 'random': + df = self.df.sample(cut_off, random_state=random_seed) + elif order == 'first': + df = self.df.head(cut_off) + elif order == 'last': + df = self.df.head(cut_off) + else: + raise NotImplementedError # join context..contextend contexts = self.corpus.dump_from_s_att(self.s_context, annotation=False) @@ -419,7 +468,7 @@ def concordance(self, p_show=['word', 'lemma'], s_show=[], lines = lines.rename(columns=col_mapper) lines = list(lines.apply( lambda row: role_formatter( - row, match_names, s_show=names_bool+s_show, window=0 + row, match_names, s_show=names_bool+s_show, window=window ), axis=1 )) @@ -433,7 +482,7 @@ def associations(self, ams=None, frequencies=True, counts.columns = [c.split("COUNTS_")[-1] for c in counts.columns] cooc = counts > 0 - # TODO obere Dreiecksmatrix + # TODO triangular matrix tables = DataFrame() for name in counts.columns: table = round(textual_associations( @@ -467,7 +516,6 @@ def textual_associations(cooc, N, column): 'f': f, 'N': N }) - contingencies = DataFrame(records).set_index('candidate') measures = calculate_measures(contingencies, freq=True) contingencies = contingencies.join(measures) @@ -476,52 +524,121 @@ def textual_associations(cooc, N, column): def create_constellation(corpus_name, - topic_name, topic_items, - p_query, s_query, flags, escape, - s_context, context, + # discoursemes + topic_discourseme, + filter_discoursemes, additional_discoursemes, - lib_path=None, cqp_bin='cqp', + # context settings + s_context, + context=20, + # query settings + p_query='word', + s_query=None, + flags='%cd', + escape=True, + match_strategy='longest', + # CWB settings + lib_path=None, + cqp_bin='cqp', registry_path='/usr/local/share/cwb/registry/', data_path='/tmp/ccc-data/', - match_strategy='longest', - dataframe=False, drop=True, text=False): - """ - simple constellation creator + window=None): + """simple constellation creator. returns a Constellation() if a + topic_discourseme is given, otherwise a TextConstellation(). 
Note + that for TextConstellations, there is no difference between + additional discoursemes and filter discoursemes (instead, a + boolean column for each discourseme is added). + + :param dict topic_discourseme: used for init + :param dict filter_discoursemes: inner join + :param dict additional_discourseme: left join + """ + # pre-process parameters + s_context = s_query if not s_context else s_context + s_query = s_context if s_query is None else s_query + # init corpus corpus = Corpus(corpus_name, lib_path, cqp_bin, registry_path, data_path) - # init discourseme constellation - topic_query = format_cqp_query( - topic_items, p_query=p_query, s_query=s_query, flags=flags, escape=escape - ) - topic_dump = corpus.query( - topic_query, context=context, context_break=s_context, - match_strategy=match_strategy - ) + # topic -> Constellation() + if len(topic_discourseme) > 0: + + if len(topic_discourseme) > 1: + raise ValueError("only one topic discourseme can be given") - if not text: + # init with topic + topic_name = list(topic_discourseme.keys())[0] + topic_items = topic_discourseme[topic_name] + topic_query = format_cqp_query( + topic_items, p_query=p_query, s_query=s_query, flags=flags, escape=escape + ) + topic_dump = corpus.query( + topic_query, + context=context, + context_break=s_context, + match_strategy=match_strategy + ) const = Constellation(topic_dump, topic_name) + + # add filter discoursemes + for disc_name, disc_items in filter_discoursemes.items(): + disc_query = format_cqp_query( + disc_items, p_query=p_query, s_query=s_query, flags=flags, escape=escape + ) + disc_dump = corpus.query( + disc_query, + context=None, + context_break=s_context, + match_strategy=match_strategy + ) + const.add_discourseme(disc_dump, disc_name, drop=True, window=window) + + # add additional discoursemes + for disc_name, disc_items in additional_discoursemes.items(): + disc_query = format_cqp_query( + disc_items, p_query=p_query, s_query=s_query, flags=flags, escape=escape + ) + disc_dump = corpus.query( + disc_query, + context=None, + context_break=s_context, + match_strategy=match_strategy + ) + const.add_discourseme(disc_dump, disc_name, drop=False) + + # no topic -> TextConstellation() else: - const = TextConstellation(topic_dump, s_context, topic_name) - # add further discoursemes - for disc_name in additional_discoursemes.keys(): - disc_items = additional_discoursemes[disc_name] - disc_query = format_cqp_query( - disc_items, p_query=p_query, s_query=s_query, flags=flags, escape=escape + # no filter implemented: all discoursemes are equal + discoursemes = {**filter_discoursemes, **additional_discoursemes} + + # init with arbitrary topic + topic_name = list(discoursemes.keys())[0] + topic_items = discoursemes.pop(topic_name) + topic_query = format_cqp_query( + topic_items, p_query=p_query, s_query=s_query, flags=flags, escape=escape ) - disc_dump = corpus.query( - disc_query, context=None, context_break=s_context, + topic_dump = corpus.query( + topic_query, + context=context, + context_break=s_context, match_strategy=match_strategy ) - if not text: - const.add_discourseme(disc_dump, disc_name, drop=drop) - else: + const = TextConstellation(topic_dump, s_context, topic_name) + + # add further discoursemes + for disc_name, disc_items in discoursemes.items(): + disc_query = format_cqp_query( + disc_items, p_query=p_query, s_query=s_query, flags=flags, escape=escape + ) + disc_dump = corpus.query( + disc_query, + context=None, + context_break=s_context, + match_strategy=match_strategy + ) 
const.add_discourseme(disc_dump, disc_name) - if dataframe: - return const.df - else: - return const + return const diff --git a/tests/conftest.py b/tests/conftest.py index 6ee0b42..e693605 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -87,16 +87,16 @@ def discoursemes(): """ discoursemes """ return { - 'items_topic': ["CDU", "CSU"], - 'items_1': ["und"], - 'items_2': ["Bundesregierung"], + 'topic': ["CDU", "CSU"], + 'disc1': ["und"], + 'disc2': ["Bundesregierung"], 'parameters': { 'flags_query': '%cd', 'escape_query': False, 'p_query': 'lemma', 's_query': 's', 's_context': 'p', - 'context': 20, + 'context': 20 } } diff --git a/tests/test_11_discoursemes.py b/tests/test_11_discoursemes.py index 63235dd..1a31341 100644 --- a/tests/test_11_discoursemes.py +++ b/tests/test_11_discoursemes.py @@ -7,6 +7,7 @@ from pandas import DataFrame import pytest + ####################### # ccc.discoursemes #### ####################### @@ -48,17 +49,17 @@ def get_corpus(corpus_settings, data_path=DATA_PATH): data_path=data_path ) -############ -# CREATION # -############ -@pytest.mark.discourseme + +################# +# CONSTELLATION # +################# def test_constellation_init(germaparl, discoursemes): corpus = get_corpus(germaparl) # init constellation topic_query = format_cqp_query( - discoursemes['items_topic'], + discoursemes['topic'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], flags=discoursemes['parameters']['flags_query'], @@ -75,14 +76,13 @@ def test_constellation_init(germaparl, discoursemes): assert len(const.df) == 2777 -@pytest.mark.discourseme def test_constellation_add(germaparl, discoursemes): corpus = get_corpus(germaparl) # init constellation topic_query = format_cqp_query( - discoursemes['items_topic'], + discoursemes['topic'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], flags=discoursemes['parameters']['flags_query'], @@ -97,7 +97,7 @@ def test_constellation_add(germaparl, discoursemes): # add discourseme disc1_query = format_cqp_query( - discoursemes['items_1'], + discoursemes['disc1'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], flags=discoursemes['parameters']['flags_query'], @@ -115,14 +115,13 @@ def test_constellation_add(germaparl, discoursemes): assert len(const.discoursemes) == 2 -@pytest.mark.discourseme def test_constellation_add_nodrop(germaparl, discoursemes): corpus = get_corpus(germaparl) # init constellation topic_query = format_cqp_query( - discoursemes['items_topic'], + discoursemes['topic'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], flags=discoursemes['parameters']['flags_query'], @@ -138,7 +137,7 @@ def test_constellation_add_nodrop(germaparl, discoursemes): # add discourseme disc1_query = format_cqp_query( - discoursemes['items_1'], + discoursemes['disc1'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], flags=discoursemes['parameters']['flags_query'], @@ -156,14 +155,13 @@ def test_constellation_add_nodrop(germaparl, discoursemes): assert len(const.df) == 3060 -@pytest.mark.discourseme def test_constellation_add2(germaparl, discoursemes): corpus = get_corpus(germaparl) # init constellation topic_query = format_cqp_query( - discoursemes['items_topic'], + discoursemes['topic'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], 
flags=discoursemes['parameters']['flags_query'], @@ -178,7 +176,7 @@ def test_constellation_add2(germaparl, discoursemes): # add discourseme 1 disc1_query = format_cqp_query( - discoursemes['items_1'], + discoursemes['disc1'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], flags=discoursemes['parameters']['flags_query'], @@ -196,7 +194,7 @@ def test_constellation_add2(germaparl, discoursemes): # add discourseme 2 disc2_query = format_cqp_query( - discoursemes['items_2'], + discoursemes['disc2'], p_query=discoursemes['parameters']['p_query'], s_query=discoursemes['parameters']['s_query'], flags=discoursemes['parameters']['flags_query'], @@ -215,6 +213,9 @@ def test_constellation_add2(germaparl, discoursemes): assert len(const.df) == 13 +######################## +# CREATE_CONSTELLATION # +######################## def test_create_constellation(germaparl, discoursemes): corpus_name = germaparl['corpus_name'] @@ -229,39 +230,51 @@ def test_create_constellation(germaparl, discoursemes): context = parameters['context'] # get topic and additional discoursemes - names = list(discoursemes.keys()) - topic_name = names[0] - topic_items = discoursemes.pop(topic_name) - additional_discoursemes = discoursemes - - const = create_constellation(corpus_name, topic_name, topic_items, - p_query, s_query, flags, escape, - s_context, context, - additional_discoursemes, + topic_items = discoursemes.pop('topic') + topic_discourseme = { + 'topic': topic_items + } + discoursemes = discoursemes + + # filter + const = create_constellation(corpus_name, + # discoursemes + topic_discourseme, + discoursemes, + {}, + # context settings + s_context, + context, + # query settings + p_query, + s_query, + flags, + escape, + # CWB setttings registry_path=germaparl['registry_path'], data_path=DATA_PATH) assert len(const.df) == 10 - df = create_constellation(corpus_name, topic_name, topic_items, - p_query, s_query, flags, escape, - s_context, context, - additional_discoursemes, - dataframe=True, - registry_path=germaparl['registry_path'], - data_path=DATA_PATH) - - assert len(df) == 10 - - df = create_constellation(corpus_name, topic_name, topic_items, - p_query, s_query, flags, escape, - s_context, context, - additional_discoursemes, - registry_path=germaparl['registry_path'], - data_path=DATA_PATH, - dataframe=True, drop=False) + # highlight + const = create_constellation(corpus_name, + # discoursemes + topic_discourseme, + {}, + discoursemes, + # context settings + s_context, + context, + # query settings + p_query, + s_query, + flags, + escape, + # CWB setttings + registry_path=germaparl['registry_path'], + data_path=DATA_PATH) - assert len(df) == 2990 + assert len(const.df) == 2990 def test_create_textconstellation(germaparl, discoursemes): @@ -277,24 +290,72 @@ def test_create_textconstellation(germaparl, discoursemes): s_context = parameters['s_context'] context = parameters['context'] - # get topic and additional discoursemes - names = list(discoursemes.keys()) - topic_name = names[0] - topic_items = discoursemes.pop(topic_name) - additional_discoursemes = discoursemes - - const = create_constellation(corpus_name, topic_name, topic_items, - p_query, s_query, flags, escape, - s_context, context, - additional_discoursemes, + # create constellation + const = create_constellation(corpus_name, + # discoursemes + {}, + {}, + discoursemes, + # context settings + s_context, + context, + # query settings + p_query, + s_query, + flags, + escape, + # CWB setttings 
registry_path=germaparl['registry_path'], - data_path=DATA_PATH, text=True) + data_path=DATA_PATH) assert len(const.df) == 2198 -@pytest.mark.mmda -def test_mmda(germaparl): +############### +# CONCORDANCE # +############### +def test_constellation_conc(germaparl, discoursemes): + + # parameters + parameters = discoursemes.pop('parameters') + + # get topic and additional discoursemes + topic_items = discoursemes.pop('topic') + topic_discourseme = { + 'topic': topic_items + } + discoursemes = discoursemes + + # filter + const = create_constellation(germaparl['corpus_name'], + # discoursemes + topic_discourseme, + discoursemes, + {}, + # context settings + parameters['s_context'], + parameters['context'], + # query settings + parameters['p_query'], + parameters['s_query'], + parameters['flags_query'], + parameters['escape_query'], + # CWB setttings + registry_path=germaparl['registry_path'], + data_path=DATA_PATH) + + lines = const.concordance(s_show=['text_id']) + + assert len(lines) == 3 + assert isinstance(lines[0], dict) + assert 'word' in lines[0] + assert isinstance(lines[0]['word'], list) + + +############### +# COLLOCATION # +############### +def test_constellation_collocates(germaparl): topic_name = 'topic' topic_items = ['CDU', 'CSU'] @@ -316,19 +377,29 @@ def test_mmda(germaparl): order = 'log_likelihood' escape = True frequencies = True + match_strategy = 'longest' # preprocess parameters s_query = s_context if s_query is None else s_query topic_name = 'topic' # create constellation - const = create_constellation(germaparl['corpus_name'], - topic_name, topic_items, - p_query, s_query, flags_query, escape, - s_context, context, - additional_discoursemes, - lib_path, cqp_bin, - germaparl['registry_path']) + const = create_constellation( + germaparl['corpus_name'], + {topic_name: topic_items}, + {}, + additional_discoursemes, + s_context, + context, + p_query, + s_query, + flags_query, + escape, + match_strategy, + lib_path, + cqp_bin, + germaparl['registry_path'] + ) collocates = const.collocates(windows=windows, p_show=p_show, flags=flags_show, @@ -338,84 +409,13 @@ def test_mmda(germaparl): assert len(collocates) == 3 -############### -# CONCORDANCE # -############### -@pytest.mark.discourseme -def test_constellation_conc(germaparl, discoursemes): - - corpus = get_corpus(germaparl) - - # init constellation - topic_query = format_cqp_query( - discoursemes['items_topic'], - p_query=discoursemes['parameters']['p_query'], - s_query=discoursemes['parameters']['s_query'], - flags=discoursemes['parameters']['flags_query'], - escape=discoursemes['parameters']['escape_query'] - ) - topic_dump = corpus.query( - topic_query, - context=None, - context_break=discoursemes['parameters']['s_context'] - ) - const = Constellation(topic_dump) - - # add discourseme 1 - disc1_query = format_cqp_query( - discoursemes['items_1'], - p_query=discoursemes['parameters']['p_query'], - s_query=discoursemes['parameters']['s_query'], - flags=discoursemes['parameters']['flags_query'], - escape=discoursemes['parameters']['escape_query'] - ) - disc1_dump = corpus.query( - disc1_query, - context=None, - context_break=discoursemes['parameters']['s_context'] - ) - const.add_discourseme( - disc1_dump, - name='disc1' - ) - - # add discourseme 2 - disc2_query = format_cqp_query( - discoursemes['items_2'], - p_query=discoursemes['parameters']['p_query'], - s_query=discoursemes['parameters']['s_query'], - flags=discoursemes['parameters']['flags_query'], - escape=discoursemes['parameters']['escape_query'] - ) - disc2_dump 
-        disc2_query,
-        context=None,
-        context_break=discoursemes['parameters']['s_context']
-    )
-    const.add_discourseme(
-        disc2_dump,
-        name='disc2'
-    )
-
-    lines = const.concordance(s_show=['text_id'])
-
-    assert len(lines) == 5
-    assert isinstance(lines[0], dict)
-    assert 'word' in lines[0]
-    assert isinstance(lines[0]['word'], list)
-
-
-###############
-# COLLOCATION #
-###############
-@pytest.mark.discourseme
 def test_constellation_coll(germaparl, discoursemes):
 
     corpus = get_corpus(germaparl)
 
     # init constellation
     topic_query = format_cqp_query(
-        discoursemes['items_topic'],
+        discoursemes['topic'],
         p_query=discoursemes['parameters']['p_query'],
         s_query=discoursemes['parameters']['s_query'],
         flags=discoursemes['parameters']['flags_query'],
         escape=discoursemes['parameters']['escape_query']
@@ -430,7 +430,7 @@ def test_constellation_coll(germaparl, discoursemes):
 
     # add discourseme 1
     disc1_query = format_cqp_query(
-        discoursemes['items_1'],
+        discoursemes['disc1'],
         p_query=discoursemes['parameters']['p_query'],
         s_query=discoursemes['parameters']['s_query'],
         flags=discoursemes['parameters']['flags_query'],
@@ -448,7 +448,7 @@ def test_constellation_coll(germaparl, discoursemes):
 
     # add discourseme 2
     disc2_query = format_cqp_query(
-        discoursemes['items_2'],
+        discoursemes['disc2'],
         p_query=discoursemes['parameters']['p_query'],
         s_query=discoursemes['parameters']['s_query'],
         flags=discoursemes['parameters']['flags_query'],
@@ -479,7 +479,7 @@ def test_textual_constellation(germaparl, discoursemes):
 
     # init constellation
     topic_query = format_cqp_query(
-        discoursemes['items_topic'],
+        discoursemes['topic'],
        p_query=discoursemes['parameters']['p_query'],
         s_query=discoursemes['parameters']['s_query'],
         flags=discoursemes['parameters']['flags_query'],
@@ -504,7 +504,7 @@ def test_textual_constellation_add(germaparl, discoursemes):
 
     # init constellation
     topic_query = format_cqp_query(
-        discoursemes['items_topic'],
+        discoursemes['topic'],
         p_query=discoursemes['parameters']['p_query'],
         s_query=discoursemes['parameters']['s_query'],
         flags=discoursemes['parameters']['flags_query'],
@@ -522,7 +522,7 @@ def test_textual_constellation_add(germaparl, discoursemes):
 
     # add discourseme
     disc1_query = format_cqp_query(
-        discoursemes['items_1'],
+        discoursemes['disc1'],
         p_query=discoursemes['parameters']['p_query'],
         s_query=discoursemes['parameters']['s_query'],
         flags=discoursemes['parameters']['flags_query'],
@@ -554,18 +554,22 @@ def test_textual_constellation_association(germaparl, discoursemes):
     s_context = parameters['s_context']
     context = parameters['context']
 
-    # get topic and additional discoursemes
-    names = list(discoursemes.keys())
-    topic_name = names[0]
-    topic_items = discoursemes.pop(topic_name)
-    additional_discoursemes = discoursemes
-
-    const = create_constellation(corpus_name, topic_name, topic_items,
-                                 p_query, s_query, flags, escape,
-                                 s_context, context,
-                                 additional_discoursemes,
+    const = create_constellation(corpus_name,
+                                 # discoursemes
+                                 {},
+                                 discoursemes,
+                                 {},
+                                 # context settings
+                                 s_context,
+                                 context,
+                                 # query settings
+                                 p_query,
+                                 s_query,
+                                 flags,
+                                 escape,
+                                 # CWB settings
                                  registry_path=germaparl['registry_path'],
-                                 data_path=DATA_PATH, text=True)
+                                 data_path=DATA_PATH)
 
     assoc = const.associations()
     assert len(assoc) == 6
@@ -586,19 +590,25 @@ def test_textual_constellation_concordance(germaparl, discoursemes):
     s_context = parameters['s_context']
     context = parameters['context']
 
-    # get topic and additional discoursemes
-    names = list(discoursemes.keys())
-    topic_name = names[0]
-    topic_items = discoursemes.pop(topic_name)
-    additional_discoursemes = discoursemes
-
-    const = create_constellation(corpus_name, topic_name, topic_items,
-                                 p_query, s_query, flags, escape,
-                                 s_context, context,
-                                 additional_discoursemes,
+    # create constellation
+    const = create_constellation(corpus_name,
+                                 # discoursemes
+                                 {},
+                                 discoursemes,
+                                 {},
+                                 # context settings
+                                 s_context,
+                                 context,
+                                 # query settings
+                                 p_query,
+                                 s_query,
+                                 flags,
+                                 escape,
+                                 # CWB settings
                                  registry_path=germaparl['registry_path'],
-                                 data_path=DATA_PATH, text=True)
+                                 data_path=DATA_PATH)
+
+    # retrieve lines
+    lines = const.concordance(cut_off=None)
 
-    from pprint import pprint
-    lines = const.concordance()
-    pprint(lines[0])
+    assert len(lines) == 2198
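Taken together, the hunks above document the new `create_constellation` signature: three discourseme dicts first (topic, filter, and highlight discoursemes, as suggested by the tests), then context settings, query settings, and CWB settings. A minimal sketch of a call under that reading — the import path, corpus name, paths, and item lists are illustrative, not taken from the test suite:

    from ccc.discoursemes import create_constellation  # import path assumed

    const = create_constellation(
        'GERMAPARL1386',                  # corpus name (illustrative)
        {'topic': ['CDU', 'CSU']},        # topic discourseme
        {'disc1': ['Regierung']},         # filter discoursemes (assumed role)
        {},                               # highlight discoursemes (assumed role)
        's', 20,                          # context settings: s_context, context
        'lemma', 's', '%cd', True,        # query settings: p_query, s_query, flags, escape
        registry_path='/usr/local/share/cwb/registry',
        data_path='/tmp/ccc-data'
    )
    lines = const.concordance(s_show=['text_id'])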
From 926bb0b7ac37cb6b05965250dffe390166a133b3 Mon Sep 17 00:00:00 2001
From: Philipp Heinrich
Date: Mon, 28 Feb 2022 18:22:09 +0100
Subject: [PATCH 07/13] flags in breakdown

---
 ccc/dumps.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/ccc/dumps.py b/ccc/dumps.py
index 1d8f5db..e4e851b 100644
--- a/ccc/dumps.py
+++ b/ccc/dumps.py
@@ -10,7 +10,7 @@
 from .collocates import Collocates
 from .concordances import Concordance
 from .keywords import Keywords
-from .utils import correct_anchors, merge_intervals
+from .utils import correct_anchors, merge_intervals, fold_df
 
 logger = logging.getLogger(__name__)
 
@@ -71,18 +71,22 @@ def correct_anchors(self, corrections):
         """
         self.df = correct_anchors(self.df, corrections)
 
-    def breakdown(self, p_atts=['word']):
+    def breakdown(self, p_atts=['word'], flags=""):
         """Frequency breakdown of match..matchend.
         """
 
         logger.info('creating frequency breakdown')
-        return self.corpus.counts.dump(
+        breakdown = self.corpus.counts.dump(
             df_dump=self.df,
             start='match', end='matchend',
             p_atts=p_atts,
             strategy=1
         )
 
+        breakdown = fold_df(breakdown, flags)
+
+        return breakdown
+
     def matches(self):
         """
         :return: cpos of (match .. matchend) regions
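The new `flags` parameter threads through to `fold_df`, so a frequency breakdown can now be folded the way CQP queries are; '%cd' (fold case and diacritics) is the conventional CWB value. A sketch of the intended use — corpus name and query are illustrative:

    from ccc.cwb import Corpus

    corpus = Corpus('GERMAPARL1386')               # assumes a registered corpus
    dump = corpus.query('[lemma="Land"]')
    dump.breakdown()                               # one row per distinct surface form
    dump.breakdown(p_atts=['word'], flags='%cd')   # case-/diacritic-folded counts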
From c41449b6530a711673e015561b584b54af52aca7 Mon Sep 17 00:00:00 2001
From: Philipp Heinrich
Date: Sat, 5 Mar 2022 20:01:20 +0100
Subject: [PATCH 08/13] add method for deleting cache entries

---
 ccc/cache.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/ccc/cache.py b/ccc/cache.py
index 7596409..64e8754 100644
--- a/ccc/cache.py
+++ b/ccc/cache.py
@@ -38,11 +38,27 @@ def __init__(self, path=None):
             directory = os.path.dirname(path)
             os.makedirs(directory, exist_ok=True)
 
+    def delete(self, identifier):
+
+        if self.path is None:
+            logger.info('cache: no path')
+            return
+
+        if isinstance(identifier, str):
+            key = identifier
+        else:
+            key = generate_idx(identifier)
+
+        with shelve.open(self.path) as db:
+            if key in db.keys():
+                logger.info('cache: deleting object "%s"' % key)
+                del db[key]
+
     def get(self, identifier):
 
         if self.path is None:
             logger.info('cache: no path')
-            return None
+            return
 
         if isinstance(identifier, str):
             key = identifier
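`delete` mirrors `get`: string identifiers are used as shelve keys directly, anything else is hashed via `generate_idx`, and missing keys are silently skipped. A sketch — the cache path is illustrative, and `set` is assumed to be the pre-existing counterpart to `get`:

    from ccc.cache import Cache

    cache = Cache('/tmp/ccc-test-cache')
    cache.set('my-dump', {'some': 'result'})   # assumed existing method
    cache.get('my-dump')                       # -> {'some': 'result'}
    cache.delete('my-dump')
    cache.get('my-dump')                       # -> None (no KeyError)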
[pos="NE"] "\[" ".*" "\]"', name="Base") tokens_base = len(dump_base.matches()) corpus.subcorpus = "Base" dump_neg = corpus.query('[pos="NE"]') From 9a7897fef16cd0f56cb3c74a3c71ee9fbab295de Mon Sep 17 00:00:00 2001 From: Philipp Heinrich Date: Sun, 6 Mar 2022 02:24:44 +0100 Subject: [PATCH 12/13] show wordlists; cqp init: execute macros for real --- ccc/cwb.py | 38 ++++++++++++++++++++++++++++++++------ tests/test_04_cwb.py | 19 ++++++++++++++----- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/ccc/cwb.py b/ccc/cwb.py index 476f1c9..fc03204 100644 --- a/ccc/cwb.py +++ b/ccc/cwb.py @@ -48,10 +48,6 @@ def start_cqp(cqp_bin, registry_path, if data_path is not None: cqp.Exec('set DataDirectory "%s"' % data_path) - if corpus_name is not None: - cqp.Exec(corpus_name) - if subcorpus is not None: - cqp.Exec(subcorpus) if lib_path is not None: @@ -69,10 +65,20 @@ def start_cqp(cqp_bin, registry_path, abs_path = os.path.abspath(macro) cqp_exec = 'define macro < "%s";' % abs_path cqp.Exec(cqp_exec) - # execute each macro once (avoids CQP shortcoming for nested macros) + # for wordlists defined in macros, it is necessary to execute the macro once macros = cqp.Exec("show macro;").split("\n") for macro in macros: - cqp.Exec(macro) + cqp.Exec(macro.split("(")[0] + "();") + # NB: this yields !cqp.Ok() if macro is not zero-valent + + # initialize corpus after macro definition, so execution of macro doesn't spend time + if corpus_name is not None: + cqp.Exec(corpus_name) + if subcorpus is not None: + cqp.Exec(subcorpus) + + if not cqp.Ok(): + raise NotImplementedError() return cqp @@ -283,11 +289,31 @@ def _macros_available(self): :rtype: list """ + cqp = self.start_cqp() defined_macros = cqp.Exec("show macro;").split("\n") cqp.__kill__() + return defined_macros + def _wordlists_available(self): + """Get available wordlists. + + :return: defined wordlists + :rtype: list + + """ + + cqp = self.start_cqp() + defined_wordlists = cqp.Exec("show var;").split("\n") + cqp.__kill__() + + names = sorted( + [n.rstrip(" =") for n in defined_wordlists if n.startswith("$") and n.endswith(" =")] + ) + + return names + def _attributes_available(self): """Get indexed p- and s-attributes. Will be run once when initializing the corpus. 
From a8989fc3a0b12eef3fda654b7e4e97894edf6455 Mon Sep 17 00:00:00 2001
From: Philipp Heinrich
Date: Sun, 6 Mar 2022 02:51:48 +0100
Subject: [PATCH 13/13] bump version

---
 ccc/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ccc/version.py b/ccc/version.py
index 0fb0597..17c1a62 100644
--- a/ccc/version.py
+++ b/ccc/version.py
@@ -1 +1 @@
-__version__ = "0.10.2.dev2"
+__version__ = "0.10.2"
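One behavioural note on PATCH 09 above: `nqr_from_query` now checks `self.Ok()` right after defining the NQR, so a malformed query logs an error and returns an empty DataFrame instead of failing further down. A sketch — the `start_cqp` keyword arguments and corpus name are illustrative:

    from ccc.cwb import start_cqp

    cqp = start_cqp('cqp', '/usr/local/share/cwb/registry',
                    corpus_name='GERMAPARL1386')
    df = cqp.nqr_from_query('[word="unbalanced')   # malformed: unbalanced quote
    assert df.empty                                # empty DataFrame, no exception
    cqp.__kill__()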