Merge pull request #52 from ausgerechnet/v0.11.2
v0.11.2
ausgerechnet authored Nov 25, 2022
2 parents 80d758e + 8523bbe commit b5f67f7
Showing 11 changed files with 794 additions and 526 deletions.
10 changes: 10 additions & 0 deletions CITATION.cff
@@ -0,0 +1,10 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Heinrich"
  given-names: "Philipp"
  orcid: "https://orcid.org/0000-0002-4785-9205"
title: "cwb-ccc"
version: 0.11.2
date-released: 2022-11-25
url: "https://github.com/ausgerechnet/cwb-ccc"
2 changes: 1 addition & 1 deletion Pipfile
@@ -5,7 +5,7 @@ verify_ssl = true

[dev-packages]
cython = "==0.29.30"
pytest = "==7.0.1"
pytest = "==7.2.0"
pylint = "==2.13.9"
pytest-cov = "==3.0.0"
tabulate = "==0.8.9"
446 changes: 221 additions & 225 deletions Pipfile.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion ccc/concordances.py
@@ -7,7 +7,7 @@
"""
import itertools
import logging
-from random import sample
+from random import sample, seed

# requirements
from pandas import DataFrame
@@ -287,6 +287,9 @@ def lines(self, form='simple', p_show=['word'], s_show=[],
        if (cut_off is None) or (len(matches) < cut_off):
            cut_off = len(matches)
        # order
+        if isinstance(order, int):
+            seed(order)
+            order = 'random'
        if order == 'random':
            matches = sample(list(matches), cut_off)
        elif order == 'first':
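
With this change, passing an integer as `order` to `Concordance.lines()` seeds Python's random module and then samples as usual, so the same integer always yields the same random selection of concordance lines. A minimal usage sketch, assuming a standard cwb-ccc setup (the corpus name and query below are placeholders):

from ccc import Corpus

corpus = Corpus("GERMAPARL1386")                      # placeholder corpus
dump = corpus.query('[lemma="Arbeit"]', context_break='s')

# order=42 calls seed(42) and falls back to order='random':
# repeated runs return the same 10 randomly sampled lines
lines = dump.concordance(form='simple', order=42, cut_off=10)
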
184 changes: 181 additions & 3 deletions ccc/cwb.py
@@ -18,16 +18,21 @@
# part of module
from .cache import Cache, generate_idx, generate_library_idx
from .cl import Corpus as Attributes
from .concordances import Concordance
from .counts import Counts, cwb_scan_corpus
from .cqp import CQP
from .dumps import Dump
-from .utils import chunk_anchors, correct_anchors, preprocess_query
+from .utils import (chunk_anchors, correct_anchors, dump_left_join,
+                    format_roles, group_lines, preprocess_query, aggregate_matches)
from .version import __version__

logger = logging.getLogger(__name__)


def decode(text):
    """safely decode a string catching common errors
    """
    try:
        text = text.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
@@ -1101,8 +1106,7 @@ def query_cqp(self, cqp_query, context=20, context_left=None,
        )

        # if dump has been retrieved from cache, NQR might not exist
-        if self.show_nqr().empty or \
-           name not in self.show_nqr()['subcorpus'].values:
+        if save and (self.show_nqr().empty or name not in self.show_nqr()['subcorpus'].values):
            # undump the dump and save to disk
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_dump, name)
@@ -1160,3 +1164,177 @@ def query(self, cqp_query=None, context=20, context_left=None,

        else:
            raise NotImplementedError()

    def quick_query(self, s_context, topic_query="", filter_queries=[], match_strategy='longest'):
        """makes sure query result is defined as subcorpus.
        without topic query:
        - finds all s_context spans that contain at least one filter_query
        with topic query:
        - finds all s_context spans that contain topic_query and all filter_queries
        :return: identifier (name of NQR on disk)
        :rtype: str
        """

        if len(topic_query) == 0:
            identifier = generate_idx([self.subcorpus, filter_queries, s_context, match_strategy], prefix='Query')

            cqp = self.start_cqp()
            cqp.Exec(f'set MatchingStrategy "{match_strategy}";')
            size = int(cqp.Exec(f'size {identifier};'))

            if size == 0:
                disjunction = " | ".join(['(' + q + ')' for q in filter_queries])
                logger.info(f'disjunction query: {disjunction}')
                cqp.Query(f'{identifier} = {disjunction} within {s_context} expand to {s_context};')
                logger.info(f'.. saving {identifier} in CWB binary format')
                cqp.Exec(f'save {identifier};')

            return identifier

        # IDENTIFY
        topic_identifier = generate_idx([self.subcorpus, topic_query, s_context, match_strategy], prefix='Query')
        filter_identifier = generate_idx([self.subcorpus, topic_query, s_context, match_strategy, filter_queries], prefix='Query')

        # CHECK CQP
        cqp = self.start_cqp()
        cqp.Exec(f'set MatchingStrategy "{match_strategy}";')
        size = int(cqp.Exec(f'size {filter_identifier};'))

        if size == 0:

            # TODO: avoid saving twice if there's no filter
            size = int(cqp.Exec(f'size {topic_identifier};'))

            logger.info(f'topic query: {topic_query}')
            if size == 0:
                # TOPIC
                cqp.Query(f'{topic_identifier} = {topic_query} expand to {s_context};')
                logger.info(f'.. saving {topic_identifier} in CWB binary format')
                cqp.Exec(f'save {topic_identifier};')
            logger.info('.. size: ' + cqp.Exec(f'size {topic_identifier};'))

            # FILTER
            cqp.Exec(f'{filter_identifier} = {topic_identifier};')
            for query in filter_queries:
                logger.info(f'filter query: {query}')
                cqp.Exec(f'{filter_identifier};')
                cqp.Query(f'{filter_identifier} = {query} expand to {s_context};')
                logger.info('.. size: ' + cqp.Exec(f'size {filter_identifier};'))

            # SAVE
            logger.info(f'.. saving {filter_identifier} in CWB binary format')
            cqp.Exec(f'save {filter_identifier};')

        cqp.__kill__()

        return filter_identifier

    def quick_conc(self, topic_query, s_context, window, order=42,
                   cut_off=100, highlight_queries=dict(),
                   filter_queries=dict(), p_show=['word'], s_show=[],
                   match_strategy='longest'):
        """
        :return: concordance lines, each one a dict
        :rtype: list(dict)
        """

        if len(topic_query) == 0:

            queries = {**highlight_queries, **filter_queries}

            # INIT CQP
            identifier = self.quick_query(s_context, topic_query="", filter_queries=queries.values(), match_strategy=match_strategy)
            cqp = self.start_cqp()

            # init CONTEXT (TextConstellation)
            cqp.Exec(f'cut {identifier} {cut_off};')
            df_context = cqp.Dump(f'{identifier};')
            dump_context = Dump(self.copy(), df_context, None)
            dump_context = dump_context.set_context(context_break=s_context)
            df_context = dump_context.df[['contextid']]
            df_context = df_context.reset_index().set_index('contextid')

            # HIGHLIGHT
            cqp.Exec(f'{identifier};')
            for name, query in queries.items():
                cqp.Exec(f'Temp = {query};')
                df_query = cqp.Dump('Temp;')
                if len(df_query) > 0:
                    dump_query = Dump(self.copy(), df_query, None)
                    dump_query = dump_query.set_context(context_break=s_context)
                    df_query = dump_query.df[['contextid']]
                    df_agg = aggregate_matches(df_query, name)
                    df_context = df_context.join(df_agg)
                else:
                    df_context[name] = None
                    df_context[name + '_BOOL'] = False
                    df_context[name + '_COUNTS'] = 0
            cqp.__kill__()

            # index by CONTEXT MATCHES
            df = df_context.set_index(['match', 'matchend'])
            names = list(queries.keys())
            names_bool = [n + '_BOOL' for n in names]
            names_count = [n + '_COUNTS' for n in names]
            for b, c in zip(names_bool, names_count):
                df[b] = df[b].fillna(False)
                df[c] = df[c].fillna(0)

            # ACTUAL CONCORDANCING
            conc = Concordance(self.copy(), df)
            lines = conc.lines(form='dict', p_show=p_show, s_show=s_show, order=order, cut_off=cut_off)
            output = lines.apply(lambda row: format_roles(row, names, s_show=names_bool+s_show, window=0, htmlify_meta=True), axis=1)

        else:

            # INIT CQP
            identifier = self.quick_query(s_context, topic_query, filter_queries.values(), match_strategy)
            cqp = self.start_cqp()

            # init CONTEXT (TextConstellation)
            cqp.Exec(f'{identifier};')
            df_context = cqp.Dump(f'{identifier};')
            dump_context = Dump(self.copy(), df_context, None)
            dump_context = dump_context.set_context(window, s_context)
            df_context = dump_context.df[['contextid', 'context', 'contextend']]

            # index by TOPIC MATCHES
            cqp.Exec(f'Temp = {topic_query};')
            df_query = cqp.Dump('Temp;')
            dump_query = Dump(self.copy(), df_query, None)
            dump_query = dump_query.set_context(window, s_context)
            df_context = dump_left_join(df_context, dump_query.df, 'topic', drop=True, window=window)
            df_context = df_context.set_index(['match_topic', 'matchend_topic'])
            df_context.index.names = ['match', 'matchend']

            # FILTER according to window size
            for name, query in filter_queries.items():
                cqp.Exec(f'Temp = {query};')
                df_query = cqp.Dump('Temp;')
                dump_query = Dump(self.copy(), df_query, None)
                dump_query = dump_query.set_context(window, s_context)
                df_context = dump_left_join(df_context, dump_query.df, name, drop=True, window=window)
                df_context = df_context.drop([c + "_" + name for c in ['match', 'matchend', 'offset']], axis=1)

            # HIGHLIGHT
            for name, query in highlight_queries.items():
                cqp.Exec(f'Temp = {query};')
                df_query = cqp.Dump('Temp;')
                dump_query = Dump(self.copy(), df_query, None)
                dump_query = dump_query.set_context(window, s_context)
                df_context = dump_left_join(df_context, dump_query.df, name, drop=False, window=window)

            cqp.__kill__()

            # ACTUAL CONCORDANCING
            hkeys = list(highlight_queries.keys())
            df = group_lines(df_context, hkeys)
            conc = Concordance(self.copy(), df)
            lines = conc.lines(form='dict', p_show=p_show, s_show=s_show, order=order, cut_off=cut_off)
            output = lines.apply(lambda row: format_roles(row, hkeys, s_show, window, htmlify_meta=True), axis=1)

        return list(output)
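
Taken together, `quick_query` materialises the (filtered) query result as a named NQR saved on disk, and `quick_conc` builds concordance lines on top of it, either per context span (no topic query) or around topic matches within a window. A hedged usage sketch, assuming both methods are called on a `ccc.Corpus` object as defined in `ccc/cwb.py`; the corpus name and CQP queries are illustrative placeholders:

from ccc import Corpus

corpus = Corpus("GERMAPARL1386")  # illustrative corpus name

# define and save an NQR: sentences containing the topic and the filter query;
# the returned string is the name of the NQR on disk
nqr_name = corpus.quick_query(
    s_context='s',
    topic_query='[lemma="Integration"]',
    filter_queries=['[lemma="Zuwanderung"]'],
)

# concordance lines around topic matches, reproducibly sampled via order=42
lines = corpus.quick_conc(
    topic_query='[lemma="Integration"]',
    s_context='s',
    window=10,
    highlight_queries={'migration': '[lemma="Migration"]'},
    filter_queries={'zuwanderung': '[lemma="Zuwanderung"]'},
    order=42,
    cut_off=50,
)
print(len(lines), lines[0].keys())
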
