Skip to content

Commit

Permalink
CRAN release v0.5.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed Mar 30, 2022
2 parents 6c88e6c + e587209 commit 1127ea0
Show file tree
Hide file tree
Showing 94 changed files with 8,354 additions and 1,939 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
^src/cwb/doc$
^src/cwb/man$
^src/cwb/instutils$
^src/cwb/editline$
^_pkgdown\.yml$
^docs$
^CONDUCT.md$
Expand All @@ -15,3 +16,4 @@
^include$
^CRAN-RELEASE$
^\.github$
^CRAN-SUBMISSION$
43 changes: 25 additions & 18 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
matrix:
config:
- {os: windows-latest, r: 'release'}
- {os: windows-2022, r: 'devel'}
# - {os: windows-2022, r: 'devel'}
- {os: macOS-latest, r: 'release'}
- {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
Expand All @@ -46,18 +46,18 @@ jobs:
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- name: Setup R (Windows UCRT)
if: matrix.config.os == 'windows-2022'
uses: kalibera/ucrt3/actions/r-install@main
# - name: Setup R (Windows UCRT)
# if: matrix.config.os == 'windows-2022'
# uses: kalibera/ucrt3/actions/r-install@main

- name: Install UCRT toolchain
if: matrix.config.os == 'windows-2022'
uses: kalibera/ucrt3/actions/toolchain-install@main
with:
# base ... toolchain has the compilers and libraries to build R and recommended packages
# full ... additional libraries to build CRAN packages
# none ... no toolchain is needed (no native code)
toolchain-type: full
# - name: Install UCRT toolchain
# if: matrix.config.os == 'windows-2022'
# uses: kalibera/ucrt3/actions/toolchain-install@main
# with:
# # base ... toolchain has the compilers and libraries to build R and recommended packages
# # full ... additional libraries to build CRAN packages
# # none ... no toolchain is needed (no native code)
# toolchain-type: full

- uses: r-lib/actions/setup-pandoc@v2

Expand Down Expand Up @@ -107,12 +107,19 @@ jobs:
rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
shell: Rscript {0}

- name: Check package (UCRT)
if: matrix.config.os == 'windows-2022'
env:
_R_INSTALL_TIME_PATCHES_: no
TZ: UTC
uses: kalibera/R-actions/pkg-check@master
# - name: Whereabouts of gcc and g++
# if: matrix.config.os == 'windows-2022'
# run: |
# echo %PATH%
# where gcc
# where g++

# - name: Check package (UCRT)
# if: matrix.config.os == 'windows-2022'
# env:
# _R_INSTALL_TIME_PATCHES_: no
# TZ: UTC
# uses: kalibera/R-actions/pkg-check@master

- name: Build Windows binary package
if: matrix.os == 'windows-latest'
Expand Down
2 changes: 0 additions & 2 deletions CRAN-RELEASE

This file was deleted.

18 changes: 9 additions & 9 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: RcppCWB
Type: Package
Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB')
Version: 0.5.0
Date: 2022-01-31
Version: 0.5.2
Date: 2022-03-28
Author: Andreas Blaette [aut, cre],
Bernard Desgraupes [aut],
Sylvain Loiseau [aut],
Expand Down Expand Up @@ -40,21 +40,21 @@ License: GPL-3
Encoding: UTF-8
Copyright: For the copyrights for the 'Corpus Workbench' (CWB) and acknowledgement of authorship, see file COPYRIGHTS.
NeedsCompilation: yes
SystemRequirements: GNU make, ncurses, pcre (>= 7 < 10), GLib (>= 2.0.0). On Windows, no prior
installations are necessary, as pre-built (i.e. cross-compiled) binaries of required libraries are
downloaded from a GitHub repository (<https://github.com/PolMine/libcl>) during installation. On
macOS, static libraries of Glib are downloaded (<https://github.com/PolMine/libglib>) if Glib is
not present.
SystemRequirements: GNU make, pcre (>= 7 < 10), GLib (>= 2.0.0). On Windows, no prior installations are necessary,
as pre-built (i.e. cross-compiled) binaries of required libraries are downloaded from a GitHub repository
(<https://github.com/PolMine/libcl>) during installation. On macOS, static libraries of GLib are downloaded
(<https://github.com/PolMine/libglib>) if GLib is not present.
Imports:
Rcpp (>= 1.0.7)
Rcpp (>= 1.0.7),
fs
Suggests:
knitr,
testthat
LinkingTo: Rcpp
Biarch: true
URL: https://github.com/PolMine/RcppCWB
BugReports: https://github.com/PolMine/RcppCWB/issues
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
Roxygen: list(markdown = TRUE)
Collate:
'RcppCWB_package.R'
Expand Down
29 changes: 28 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,38 @@

export(check_corpus)
export(check_cpos)
export(check_cqp_query)
export(check_id)
export(check_p_attribute)
export(check_pkg_registry_files)
export(check_query)
export(check_region_matrix)
export(check_registry)
export(check_s_attribute)
export(check_strucs)
export(cl_charset_name)
export(cl_delete_corpus)
export(cl_find_corpus)
export(cl_struc_values)
export(corpus_data_dir)
export(corpus_is_loaded)
export(cpos_to_id)
export(cpos_to_lbound)
export(cpos_to_rbound)
export(cpos_to_str)
export(cpos_to_struc)
export(cqp_drop_subcorpus)
export(cqp_dump_subcorpus)
export(cqp_get_registry)
export(cqp_initialize)
export(cqp_is_initialized)
export(cqp_list_corpora)
export(cqp_list_subcorpora)
export(cqp_load_corpus)
export(cqp_query)
export(cqp_reset_registry)
export(cqp_subcorpus_size)
export(cqp_verbosity)
export(cwb_charsets)
export(cwb_compress_rdx)
export(cwb_encode)
export(cwb_huffcode)
Expand All @@ -34,12 +44,29 @@ export(get_count_vector)
export(get_pkg_registry)
export(get_region_matrix)
export(get_tmp_registry)
export(id_to_cpos)
export(id_to_freq)
export(p_attr)
export(p_attr_lexicon_size)
export(p_attr_size)
export(ranges_to_cpos)
export(regex_to_id)
export(region_matrix_context)
export(region_matrix_to_count_matrix)
export(region_matrix_to_ids)
export(s_attr)
export(s_attr_is_descendent)
export(s_attr_is_sibling)
export(s_attr_relationship)
export(s_attr_size)
export(s_attribute_decode)
export(str_to_id)
export(struc_to_cpos)
export(struc_to_str)
export(use_tmp_registry)
exportPattern("^[[:alpha:]]+")
importFrom(Rcpp,evalCpp)
importFrom(fs,path)
importFrom(fs,path_expand)
importFrom(utils,capture.output)
useDynLib(RcppCWB, .registration = TRUE)
99 changes: 99 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,102 @@
# RcppCWB 0.5.2

* The example for `corpus_data_dir()` did not work as intended without
explicitly setting the `registry` argument. Fixed.
* New functions `corpus_info_file()`, `corpus_full_name()`,
`corpus_p_attributes()`, `corpus_s_attributes()`, `corpus_properties()` and
`corpus_property()` to retrieve registry file data.
* New function `corpus_registry_dir()`.
* The path to the info file in the registry file of the REUTERS corpus was
broken. Fixed.


# RcppCWB 0.5.1

## New Features

* New auxiliary function `cwb_charsets()` reports the charsets supported by CWB.
* New functions `cl_load_corpus()` and `cqp_load_corpus()` do what the function
names suggest.
* New function `cl_list_corpora()` complements existing function
`cqp_list_corpora()` for the CL context.
* New arguments `skip_blank_lines`, `strip_whitespace` and `xml` of
`cwb_encode()` open configuration options of `cwb_encode()`, overcoming the
previously hard-coded equivalent to the command-line option "-xsB" (#38).
* Unexported functions `.cpos_to_id()`, `.cl_find_corpus()` and
`.cl_new_attribute()` are an entry to passing around pointers, rather than
re-creating objects whenever switching from R to C.
* Functions `.s_attr()` and `.p_attr()` return pointers for a s- or
p-attribute.
* Functions `cl_*` are now available with pointer as input (e.g. `cpos_to_id()`).
* The CORPUS_REGISTRY environment variable is not set to the temporary registry,
to avoid often confusing behavior and collisions when loading RcppCWB and
polmineR at the same time (#13).
* The `cqp_drop_subcorpus()` function that has been disabled temporarily is
usable again (#34).
* `cqp_query()` is now able to process subcorpora.
* `RcppCWB:::.cqp_subcorpus()` will construct a subcorpus from a region matrix.
* The `check_corpus()` does not re-set the registry directory any more, but tries
to load the checked corpus if it has not yet been loaded.
* A new function `s_attr_relationship()` will detect whether two s-attributes are
siblings, or in a descendent or ancestor relationship.
* Functions `cwb_encode()`, `cwb_huffcode()`, `cwb_makeall()` and
`cwb_compress_rdx()` now have an argument `quietly` to control display of output
messages. `cwb_encode()` has an argument `verbose` to control whether counter on
the number of tokens processed is displayed.


## Minor improvements

* Difficulties of `cwb_encode()` to digest variations of path statements between
macOS and Windows are addressed using a reliable normalization of paths with
`fs::path()` (#48).
* Argument `encoding` is checked for the validity of the encoding passed in
(#34).
* A patch introducing a sanity check avoids the 'stringop-overflow' compiler warning
thrown by file cl/cdaccess.c on Windows (#45).
* An update of Xcode command line developer tools includes flex 2.6.4
Apple(flex-34), and this is the version used now, resulting in extensive code
changes in cl/lex.creg.c and cqp/lex.yy.c, yet without causing new errors or
changing the functionality.
* `check_cpos()` issues a warning if argument `cpos` is `NULL` (#21).
* Functions `cl_cpos2id()`, `cl_cpos2lbound()`, `cl_cpos2rbound()`,
`cl_cpos2str()` and `cl_cpos2struc()` will return an empty, zero-length integer
vector if argument `cpos` is `NULL` (#21).
* Warnings issued by `check_corpus()` (used internally by many functions)
resulted from slightly differing representations of otherwise identical
paths. Using `fs::path()` for path normalization internally will avoid
misleading warning messages.
* `cqp_get_registry()` will now return a `fs::path` object, as a safeguard for
a consistent normalization of paths.
* Function `cl_delete_corpus()` will now (visibly) return a `logical` value.
* The check for the availability of ncurses is omitted in the configure file
and the editline subdirectory of src/cwb is included in .Rbuildignore to
minimize the size of the tarball. The ncurses library is a dependency of
editline, but editline is not built in the context of this package (#26).
* `cqp_load_corpus()` will return `FALSE` if corpus has not been loaded
successfully.
* Disaggregated `wrappers.cpp` into `cl.cpp`, `cqp.cpp` and `utils.cpp`, so that
the code is organized more coherently corresponding to the different logics.
* Function `check_cqp_query()` renamed to `check_query()` to avoid a conflict
with a function defined in the polmineR package.
* `cqp_list_subcorpora()` returns a `character` vector. Previously, we just had
obscure printed messages.
* `s_attribute_decode()` will not break if s-attribute has no values (#54).
* Input to functions `cl_struc2str()` and `cl_struc2cpos()` may now include
negative values; the vectors returned will have `NA` values at the respective
positions. The check against negative values in `check_strucs` is dropped accordingly.


## Bug fixes

* The `cwb_encode()` function did not declare structural attributes in the
registry and mistakenly channeled output for the file to the terminal (#49).
Fixed.
* Re-running `cwb_encode()` did not reset global variables, which resulted in a
set of errors. Solved. (#51)



# RcppCWB 0.5.0

## New Features
Expand Down
16 changes: 6 additions & 10 deletions R/RcppCWB_package.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
#' The CWB is a classical tool which has inspired a set of developments. A
#' persisting advantage of the CWB is its mature, open source code base that
#' is actively maintained by a community of developers. It is used as a robust
#' and efficient backend for widely used tools such as TXM or CQPweb
#' and efficient backend for widely used tools such as
#' TXM (\url{https://txm.gitpages.huma-num.fr/textometrie/}) or CQPweb
#' (\url{https://cwb.sourceforge.io/cqpweb.php}). Its uncompromising C
#' implementation guarantees speed and makes it well suited to be integrated
#' with R at the same time.
Expand Down Expand Up @@ -117,27 +118,22 @@
#' # functions of the corpus library (starting with cl) expose the low-level
#' # access to the CWB corpus library (CL)
#'
#' # registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' registry <- use_tmp_registry()
#' print(registry)
#' ids <- cl_cpos2id("REUTERS", cpos = 1:20, p_attribute = "word", registry = registry)
#' tokens <- cl_id2str("REUTERS", id = ids, p_attribute = "word", registry = registry)
#' ids <- cl_cpos2id("REUTERS", cpos = 1:20, p_attribute = "word", registry = get_tmp_registry())
#' tokens <- cl_id2str("REUTERS", id = ids, p_attribute = "word", registry = get_tmp_registry())
#' print(paste(tokens, collapse = " "))
#'
#' # To use the corpus query processor (CQP) and its syntax, it is necessary first
#' # to initialize CQP (example: get concordances of 'oil')
#'
#' cqp_initialize(registry)
#' cqp_query("REUTERS", query = '[]{5} "oil" []{5}')
#' cpos_matrix <- cqp_dump_subcorpus("REUTERS")
#' concordances_oil <- apply(
#' cpos_matrix, 1,
#' function(row){
#' ids <- cl_cpos2id("REUTERS", p_attribute = "word", cpos = row[1]:row[2])
#' tokens <- cl_id2str("REUTERS", p_attribute = "word", id = ids)
#' ids <- cl_cpos2id("REUTERS", p_attribute = "word", cpos = row[1]:row[2], get_tmp_registry())
#' tokens <- cl_id2str("REUTERS", p_attribute = "word", id = ids, get_tmp_registry())
#' paste(tokens, collapse = " ")
#' }
#' )
#'
NULL

Loading

0 comments on commit 1127ea0

Please sign in to comment.