(jyun/org-latex-set-options 2.5)
- [2021-05-09 Sun] note: I dropped the event description and tried a few more embeddings.
It seems that actions from incorrect processes have low cosine similarity, which makes it difficult for them to form semantic cluster(s). Actions associated with the task (or problem solving) have high similarity. So as long as the event description remains the same, I can set the subsequence as a new action; the embedding of a new action is determined by the mean (centroid) of its members' embeddings, as sketched below.
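A minimal sketch of that centroid rule, assuming a matrix emb of action embeddings (rows = actions, columns = dimensions) and an index vector members for the actions merged into the new action; both names are hypothetical:
## hedged sketch: embedding of a merged action = centroid of its members' embeddings
## emb (actions x dims) and members (row indices) are hypothetical placeholders
new_action_embedding = function(emb, members) {
  colMeans(emb[members, , drop = FALSE])
}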
source("email_word2vec_preproc.r")
email %>% filter(SEQID == 101) %>% select(event_type, event_description, timestamp, diff)
- initial location
- inbox: 100s
- can come: 200s
- cannot come: 301
- response = (0 - 3)
- can come: 101, 104
- cannot come: 102
- can come: 201, 202 (don't move, or move but put back, e.g. TOOLBAR_trash)
- anywhere: 103, 105
- discard actions taken for a very short time (100 ms)
- otherwise the embeddings don't help much
seqs[4]
\[ p\left(w_j \mid w_0, u, v\right) = \frac{\exp\left(u(w_0)^{\top} v(w_j)\right)}{\sum_{w \in V} \exp\left(u(w_0)^{\top} v(w)\right)} \] where \(u: V \to \mathbb{R}^k\) and \(v: V \to \mathbb{R}^k\) are functions that map words to word embeddings: one for the pivot words, the other for the context.
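As a sanity check of the formula, the probability can be computed directly from the two embedding maps; a minimal R sketch, assuming U and V are |V| x k matrices whose rows are u(w) and v(w) (hypothetical names):
## hedged sketch of the skip-gram softmax above
## U, V: |V| x k matrices of pivot and context embeddings (hypothetical names)
skipgram_prob = function(U, V, w0, wj) {
  scores = as.vector(V %*% U[w0, ])  # u(w0)' v(w) for every w in V
  exp(scores[wj]) / sum(exp(scores))
}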
- action-based transitions: >= 500
- embedding-based transitions (embedding_dim)
- calendar: TAB-id=tabbutton2
- ticketing: TAB-id=tabbutton1
- event type: COMBOBOX-id=u021_default_menu1|*$index=7 (football)
- location: COMBOBOX-id=u021_default_menu2|*$index=2 (Bakerton)
- response = 1 (correct), 7 (incorrect), 0 (missing); menu1 and 6 = 9 (8 seems ok too)
- N = 1344 src_R[:session R-PIACC :exports results]{length(unique(df$SEQID))}
CD Tally
- https://www.mrc-bsu.cam.ac.uk/wp-content/uploads/2017/10/multistate_enar_webinar.pdf
- https://cran.r-project.org/web/packages/msm/msm.pdf
- cite:hill_relaxing_2021
(delete-directory "input" t)
(make-directory "input")
piacc_orig_path = "./data/PIAAC_cleaned_data_1110/Problem_solving/Problem-solving_cleaned_1110.rdata"
load(piacc_orig_path)
item = PS
b2012 = readr::read_csv("./data/PIAAC_cleaned_data_1110/Prgusap1_2012.csv")
b2017 = readr::read_csv("./data/PIAAC_cleaned_data_1110/Prgusap1_2017.csv")
## b2012 = foreign::read.spss("./data/PIAAC_cleaned_data_1110/Prgusap1_2012.sav", to.data.frame = T)
## b2017 = foreign::read.spss("./data/PIAAC_cleaned_data_1110/Prgusap1_2017.sav", to.data.frame = T)
iid = unique(item$SEQID)
b7id = unique(b2017$SEQID)
b2id = unique(b2012$SEQID)
kk = 0
for (id in iid) {
if (any(id == b2id) && any(id == b7id)) stop("duplicated id")
kk = kk + sum(id == b2id)
kk = kk + sum(id == b7id)
}
jj = 0
for (ii in names(b2017)) {
if (any(ii == names(b2012))) jj = jj +1
}
- preprocess data, word2vec, prepare for C, run C, post analysis!
ipath = "input"
if (dir.exists(ipath)) {
do.call(file.remove, list(list.files(ipath, full.names = TRUE)))
} else dir.create(ipath)
source("R/itemcode.R")
out_dir = paste0(booklet$NAME[booklet$CODEBOOK == item_code], "/")
system(paste0("figlet ", item_code))
system(paste0("figlet ", out_dir))
source("R/preproc-data.R")
system(paste0("sh run.sh ", out_dir))
ITEMCODE
## item_code = "U01a000S" ## party-1
## item_code = "U07x000S" ## book order
## item_code = "U21x000S" ## ticket
## item_code = "U04a000S" ## cannot convert to long format (msm)
## item_code = "U01b000S"
item_code = "U19a000S"
source("R/all_in_one.R")
item_code = "U07x000S"
source("R/all_in_one.R")
item_code = "U02x000S"
source("R/all_in_one.R")
item_code = "U16x000S"
source("R/all_in_one.R")
item_code = "U11b000S"
source("R/all_in_one.R")
item_code = "U23x000S"
source("R/all_in_one.R")
item_code = "U06a000S"
source("R/all_in_one.R")
library(diprom)
## library(dplyr)
## library(stringr)
## library(msm)
## library(foreach)
## library(doParallel)
stopImplicitCluster()
registerDoParallel(cores = detectCores() - 1)
## doParallel::registerDoParallel(2)
setwd('~/workspace/procmod-code/')
piacc_orig_path = "./data/PIAAC_cleaned_data_1110/Problem_solving/Problem-solving_cleaned_1110.rdata"
piacc_path = "./data/PIAAC_cleaned_data_1110/Problem_solving/Problem-solving_no_missing.rdata"
## piacc_na.omit()
piacc_background_path = "./data/PIAAC_cleaned_data_1110/Prgusap1_2017.csv"
piacc_background_spss = "./data/PIAAC_cleaned_data_1110/Prgusap1_2017.sav"
## xxx = foreign::read.spss("./data/PIAAC_cleaned_data_1110/Prgusap1_2017.sav")
item = read_piacc(piacc_path, item_code, sub_str, ignore_str)
item2sen(item)
item %>% group_by(SEQID) %>% summarise(mnum = max(event_num))
Run the Python code to do word2vec.
system(". activate tf; python piacc_word2vec.py")
metadata_path = "input/metadata.tsv"
vectors_path = "input/vectors.tsv"
metadata = readr::read_tsv(metadata_path, col_names = FALSE)
vectors = readr::read_tsv(vectors_path, col_names = FALSE)
item = filter_item(item)
Q = get_trans(item)
Dml = get_cosdist()
dat = item2long(item)
M = length(state)
N = length(unique(dat$id))
dq = nrow(Q)
dn = nrow(dat)
dc = ncol(dat)
write_data()
source("R/init.R")
write_loop_index()
read_piacc = function(piacc_path, item_code, sub_str, ignore_str, core_event = NULL) {
load(piacc_path)
item = PS %>% filter(CODEBOOK == item_code)
## core_event = c("MAIL_DRAG", "MAIL_MOVED", "MAIL_VIEWED")
## email$event_description[!(email$event_type %in% core_event)] <- ""
timestamp = item$timestamp
diff = c(diff(timestamp), 99999)
item$diff = diff
item = item[(diff >= 10) | (diff < 0), ] ## vectorized OR; `||` would use only the first element
if (length(sub_str) != 0) {
for (ii in 1:nrow(sub_str)){
item$event_description = stringr::str_replace_all(item$event_description,
sub_str[ii,1], sub_str[ii,2])}
}
if (length(ignore_str) != 0) {
for (ii in 1:length(ignore_str)) {
item = item %>% mutate(event_description = ifelse(event_type == ignore_str[ii],"",event_description)) }
}
item = item %>%
mutate(event_concat = ifelse(event_description == "", event_type, paste0(event_type,"-",event_description))) %>%
mutate(word = gsub(" ", "_", event_concat))
return(item)
}
item2sen = function(item, sid = NULL) {
ww = item$word
id = item$SEQID
ww0 = ww[1:(length(ww)-1)]
ww1 = ww[2:(length(ww))]
dup = ww0 == ww1
dup0 = c(dup, FALSE )
dup1 = c(FALSE , dup)
idx = dup1
cw = "NULL"
cid = 9999999999
for (kk in which(dup1)) {
if(cw != ww[kk] && cid != id[kk]) idx[kk] = FALSE
cw = ww[kk]
cid = id[kk]
}
item = item[!idx, ]
n = nrow(item)
pre = item$word[1:(n-1)]
cur = item$word[2:n]
dif = c(TRUE, !(cur == pre))
idx = logical(n)
for (i in 2:n) {
if(dif[i] == FALSE && dif[i-1] == FALSE) {
idx[i] = TRUE
}
}
item = item[!idx,]
id = unique(item$SEQID)
seqs = character(length(id))
for (i in id) {
## for (word in item$word[item$SEQID == i]) {
## seqs[i] = paste0(seqs[i], " ", word)
## }
seqs[i] = paste(item$word[item$SEQID == i] , collapse = " ")
}
## for (i in id) {
## seqs[i] = gsub("START ", "", seqs[i])
## seqs[i] = gsub(" END", "", seqs[i])
## }
if (is.null(sid)) {
seqs = seqs[id]
} else {
seqs = seqs[sid]
}
## weird quotation marks
## data.table::fwrite(as.data.frame(seqs), "input/item_sentence.txt", col.names=F)
## all sentences are quotation marked
## readr::write_delim(as.data.frame(seqs), "input/item_sentence.txt", col_names=F)
str2file(seqs, "input/item_sentence.txt")
}
filter_item = function(item) {
## a few more processing
## state to global env
item = item %>% filter(word !="START")
state <<- unique(item$word)
ntate = numeric(nrow(item))
ii = 0
for (ww in item$word) {
ii = ii + 1
ntate[ii] = which(state == ww)
}
item$state = ntate
uid = unique(item$SEQID)
pid = numeric(nrow(item))
ii = 0
for (nn in item$SEQID) {
ii = ii + 1
pid[ii] = which(uid == nn)
}
item$pid = pid
return(item)}
get_trans = function(item) {
id = unique(item$SEQID)
L = length(state <<- unique(item$word))
Q = matrix(0, L, L)
for (i in id) {
## for (word in item$word[item$SEQID == i]) {
## seqs[i] = paste0(seqs[i], " ", word)
## }
seq = item$state[item$SEQID == i]
for (k in 1:(length(seq)-1))
Q[seq[k], seq[k+1]] = 1
## cat(seq[length(seq)-1])
}
rsq = rowSums(Q) > 0
diag(Q) = 1 * rsq
return(Q)}
get_cosdist = function() {
dem = ncol(vectors)
vv = array(0, dim=c(M <- length(state), dem))
for (m in 1:M) {
if (sum(state[m] == metadata[,1]) > 0)
vv[m, ] = unlist(vectors[which(state[m] == metadata[,1]), ] )
}
Dml = cosdist(as.matrix(vv))
return(Dml)
}
item2long = function(item) {
df = item %>% select(pid, timestamp, response, state)
dat <- msm::msm2Surv(data=df, subject="pid", time="timestamp", state="state",
Q=Q)
##dat
## attr(dat, "trans")
dat$time = dat$time / max(dat$Tstop)
dat$Tstart = dat$Tstart / max(dat$Tstop)
dat$Tstop = dat$Tstop / max(dat$Tstop)
dat$dist = foreach (i = 1:nrow(dat), .combine="c") %dopar%
Dml[dat$from[i], dat$to[i]]
return(dat)
}
write_data = function(){
## R to C index conversion
wat = dat
wat$id = wat$id - 1
wat$from = wat$from - 1
wat$to = wat$to - 1
## to be read from cpp
readr::write_csv(as.data.frame(item),"input/item.csv", col_names = TRUE)
readr::write_csv(as.data.frame(state),"input/state.csv", col_names = FALSE)
readr::write_csv(wat,"input/dat.csv", col_names = FALSE)
readr::write_csv(as.data.frame(Q),"input/trans.csv", col_names = FALSE)
readr::write_csv(as.data.frame(cbind(M, N, dq, dn, dc)),"input/mvar.csv", col_names = FALSE)
readr::write_csv(data.frame(Dml), "input/Dml.csv", col_names = FALSE)
}
binfo = readr::read_csv("procmod-code/data/PIAAC_cleaned_data_1110/Prgusap1_2012.csv")
## binfo = foreign::read.spss("./data/PIAAC_cleaned_data_1110/Prgusap1_2012.sav")
item = readr::read_csv("input/item.csv")
pal = readr::read_csv("./data/PIAAC_cleaned_data_1110/PUFs_values.csv", col_types = readr::cols(.default = readr::col_character()))
pal[,1] = pal[,1] %>% unlist %>% toupper
vname = unique(pal[,1]) %>% unlist
bname = names(binfo)
SAS = pal[,3] %>% unlist %>% str_replace_all("\\.", "")
SPSS = pal[,4] %>% unlist
ii = 827 ## LNG_BQ
for (ii in 1:length(vname)) {
if (any(vname[ii] == bname)) {
binfo[,vname[ii]] = plyr::mapvalues(unlist(binfo[,vname[ii]]), SAS[pal[,1] == vname[ii]], SPSS[pal[,1] == vname[ii]])
## binfo[is.na(binfo[,vname[ii]]),vname[ii]] = "999999999999"
binfo[,vname[ii]] = binfo[,vname[ii]] %>% mutate_if(is.character,as.factor) %>% mutate_if(is.factor, as.numeric)
binfo[is.na(binfo[,vname[ii]]),vname[ii]] = 999999999999
}}
binfo = data.frame(SEQID = binfo$SEQID, binfo[, names(binfo) %in% vname])
binfo[is.na(binfo)] = 999999999999
readr::write_csv(binfo, "./data/PIAAC_cleaned_data_1110/PUFs_spss.csv")
for (ii in 6:ncol(binfo)) {
## binfo[,ii] = plyr::mapvalues(unlist(binfo[,ii]), "N", 9^50) %>% as.numeric
binfo[is.na(binfo[,ii]),ii] = -9^50
}
readr::write_csv(binfo, "./data/PIAAC_cleaned_data_1110/PUFs_noN.csv")
## binfo = readr::read_csv("./data/PIAAC_cleaned_data_1110/Prgusap1_2012.csv")
binfo = foreign::read.spss("./data/PIAAC_cleaned_data_1110/Prgusap1_2012.sav", to.data.frame = TRUE) %>% as.data.frame
item = readr::read_csv("input/item.csv")
var = readr::read_csv("./data/PIAAC_cleaned_data_1110/PUF_Variables.csv")
pal = readr::read_csv("./data/PIAAC_cleaned_data_1110/PUFs_values.csv", col_types = readr::cols(.default = readr::col_character()))
pal[,1] = pal[,1] %>% unlist %>% toupper
vname = unique(pal[,1]) %>% unlist
SAS = pal[,3] %>% unlist %>% str_replace_all("\\.", "")
SPSS = pal[,4] %>% unlist
bname = names(binfo)
vfact = pal %>% group_by(pal[,1]) %>% summarise(n = n())
plist = list()
for (ii in unique(unlist(pal[,1]))) {
tmp = pal[pal[,1] == ii,]
plist[[ii]] = data.frame(Value = tmp[,2] ,SAS = tmp[,3], SPSS = tmp[,4], Type = tmp[,5])
}
ii = 827 ## LNG_BQ
pinfo = data.frame(SEQID = binfo$SEQID)
for (ii in vname) {
if (any(ii == bname)) {
if (vfact[vfact[,1] == ii,2] < 4) {
pinfo[,ii] = as.character(binfo[,ii])
pinfo[,ii] = plyr::mapvalues(unlist(pinfo[,ii]), plist[[ii]][,1], plist[[ii]][,3])
pinfo[is.na(pinfo[,ii]),ii] = "999999999999"
## binfo[,vname[ii]] = binfo[,vname[ii]] %>% mutate_if(is.character,as.factor) %>% mutate_if(is.factor, as.numeric)
pinfo[,ii] = as.numeric(pinfo[,ii])
}
else
pinfo[,ii] = as.numeric(binfo[,ii])
pinfo[is.na(pinfo[,ii]),ii] = 999999999999
}}
readr::write_csv(pinfo, "./data/PIAAC_cleaned_data_1110/PUFs_spss.csv")
init_val = list(
kappa = rep(0.1, M),
tau = rep(0.1, N),
theta = rep(0.1, N),
beta = 0,
sigma = 1,
mu_beta = 0,
sigma_beta = 1
)
## lambda
params = c(init_val, list(
a_kappa = matrix(0.1,M,1),
b_kappa = matrix(0.1,M,1),
jump_kappa = matrix(0.5,M,1),
jump_beta = matrix(0.1,1,1),
## mu_theta = matrix(0.0,N,1)
## sigma_theta = matrix(1.0,N,1)
jump_theta = matrix(1.0,N,1),
a_sigma = 1.0,
b_sigma = 1.0,
a_tau = matrix(0.1,N,1),
b_tau = matrix(0.1,N,1),
jump_tau = matrix(0.5,N,1),
a_sigma_beta = 1.0,
b_sigma_beta = 1.0,
mu_mu_beta = 0.0,
sigma_mu_beta = 10.0
))
write_init(params)
write_init = function(params) {
## parameter init
readr::write_csv(as.data.frame(params$kappa),"input/init_kappa.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$tau),"input/init_tau.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$theta),"input/init_theta.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$sigma),"input/init_sigma.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$beta),"input/init_beta.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$mu_beta),"input/init_mu_beta.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$sigma_beta),"input/init_sigma_beta.csv", col_names = FALSE)
## hyper-parameter + jump
readr::write_csv(as.data.frame(cbind(params$a_kappa,params$b_kappa,params$jump_kappa)),"input/hyper_kappa.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$jump_beta),"input/hyper_beta.csv", col_names = FALSE)
readr::write_csv(as.data.frame(params$jump_theta),"input/hyper_theta.csv", col_names = FALSE)
readr::write_csv(as.data.frame(cbind(params$a_sigma,params$b_sigma)),"input/hyper_sigma.csv", col_names = FALSE)
readr::write_csv(as.data.frame(cbind(params$a_tau,params$b_tau,params$jump_tau)),"input/hyper_tau.csv", col_names = FALSE)
readr::write_csv(as.data.frame(cbind(params$a_sigma_beta,params$b_sigma_beta)),"input/hyper_sigma_beta.csv", col_names = FALSE)
readr::write_csv(as.data.frame(cbind(params$mu_mu_beta,params$sigma_mu_beta)),"input/hyper_mu_beta.csv", col_names = FALSE)
}
write_loop_index_deprecated = function() {
sink("input/out-of-state_index.txt")
for (m in 1:M)
if(sum(dat$from==m)>0) {
cat(which(dat$from == m) - 1,"\n") # R to C index conversion
} else cat(-99,"\n")
sink()
sink("input/person_index.txt")
for (k in 1:max(item$pid))
if(sum(dat$id==k)>0) {
cat(which(dat$id == k) - 1,"\n") # R to C index conversion
} else cat(-99,"\n")
sink()
}
We assume a continuously observed process with recurrent actions. The process seems to be irreversible, but I need to double-check. See https://www.mrc-bsu.cam.ac.uk/wp-content/uploads/2017/10/multistate_enar_webinar.pdf
The intensity function $q_{ml}(\cdot)$ represents the instantaneous risk of moving from action $m$ to action $l$:
\begin{align*}
q_{ml}(t; \boldsymbol{\alpha}, \boldsymbol{\beta}, \mathbf{z}(t)) &= \lambda_{k, m \to l}(t) \exp\left( \alpha_m + \alpha_l + \boldsymbol{\beta}_{m,l}' \, \mathbf{z}_{i,m,l}(t) \right),
\end{align*}
where $\lambda_{k, m \to l}(t)$ is the baseline transition intensity and $\mathbf{z}_{i,m,l}(t)$ collects the (possibly time-varying) covariates.
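A minimal numeric sketch of this intensity, assuming a constant baseline lambda0; all names are placeholders, not the C implementation:
## hedged sketch: proportional-hazards form of q_{ml}
## lambda0: baseline intensity; alpha_m, alpha_l: action effects;
## beta, z: covariate coefficients and covariates (hypothetical names)
q_ml = function(lambda0, alpha_m, alpha_l, beta, z) {
  lambda0 * exp(alpha_m + alpha_l + sum(beta * z))
}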
What questions can we answer using these models? Below are the quantities of interest in the multi-state survival model. All functions of the transition intensities are estimable (just as, in competing-risk analysis, the estimable quantities are all functions of the hazard rates); a hedged msm sketch follows the list.
- expected total time spent in state $l$ before time $t$
- expected first passage time (time of the first visit to a state)
- expected number of visits to a state
- if possible, probability of the first action
In discussion, we decided on a time-inhomogeneous model with a single terminal state.
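For reference, the msm package exposes these quantities directly; a hedged sketch, assuming fit is a fitted msm model (our model is fit in C, so this is illustrative only):
## hedged sketch with msm (fit is a hypothetical fitted model)
library(msm)
## fit = msm(state ~ timestamp, subject = SEQID, data = item, qmatrix = Q)
totlos.msm(fit)             # expected total time spent in each state
efpt.msm(fit, tostate = 2)  # expected first passage time into a state
envisits.msm(fit)           # expected number of visits to each state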
(progn (setq default-directory "~/Dropbox/research/procmod/procmod-code/diprom")
(ess-r-devtools-clean-and-rebuild-package))
#!/usr/bin/env bash
# out_dir="party_invitations-1/"
# out_dir="cd_tally/"
# out_dir="sprained_ankle-1/"
# prename="R/party_invitations-1-preprocess.R"
# initname="R/party_invitations-1-init.R"
# first argument = output dir
out_dir=$1
n_chain=2
figlet "running MCMC"
echo "================================"
echo "output dir:" $out_dir
echo "preprocessing:" $prename
echo "initializing:" $initname
echo "n_chain:" $n_chain
echo "================================"
export STAN_NUM_THREADS=2
mkdir -p output
rm output/*
# rm input/*
# Rscript $prename
cp -r input output/
cp run.sh output/
# cp $prename output/
# cp $initname output/
for ((v = 1; v <= $n_chain; v++))
do
# Rscript $initname
./main $v 10000 10000 10 parallel
done
## main function argument
# chain #, iteration, burn-in, thin
# parallel serial -> parallel computation?
mkdir -p $out_dir
mv output/* $out_dir
echo "Outputs are moved to" $out_dir"."
echo "================================"
Rscript R/run-analysis.R $out_dir $n_chain
echo "Post analysis is finished for " $out_dir"."
echo "================================"
#!/usr/bin/env Rscript
args <- commandArgs(trailingOnly = TRUE)
# test if there is at least one argument: if not, return an error
if (length(args) < 1) {
stop("out_dir must be supplied.\n", call. = FALSE)
} else out_dir <- args[1]
if (length(args) == 2) {
num_chain <- args[2]
} else if (length(args) == 1) {
num_chain = NULL
num_chain = get_nchain(out_dir)
}
## out_dir <- "chessB-singleZ-singleW/"
system(paste0("rm figure/*.pdf"))
source("R/analysis.R")
system(paste0("mkdir -p ", out_dir, "figure/"))
system(paste0("rsync -rv figure/*.pdf ", out_dir, "figure/"))
Set the output directory, run the chain of analytic tools, and move the results to the output directory.
setwd("~/workspace/procmod-code")
num_chain = 2
ldir = c(
"party_invitations-1/",
"party_invitations-2/",
"cd_tally/",
"sprained_ankle-1/",
## "sprained_ankle-2/", ## didn't run
"tickets/",
## "class_attendance/", ## didn't run
"club_membership-1/",
## "club_membership-2/", ## didn't run
"book_order/",
"meeting_room/",
## "reply_all/", ## failed
"locate_email/",
"lamp_return/")
## out_dir="party_invitations-1/"
## out_dir="party_invitations-2/"
## out_dir="cd_tally/"
## out_dir="sprained_ankle-1/"
## out_dir="sprained_ankle-2/"
## out_dir="tickets/"
## out_dir="class_attendance/"
## out_dir="club_membership-1/"
## out_dir="club_membership-2/"
## out_dir="book_order/"
## out_dir="meeting_room/"
## out_dir="reply_all/"
## out_dir="locate_email/"
## out_dir="lamp_return/"
for (out_dir in ldir) {
system(paste0("rm figure/*.pdf"))
system(paste0("mkdir -p figure/"))
source("R/analysis.R")
system(paste0("mkdir -p ", out_dir, "figure/"))
system(paste0("rsync -rv figure/*.pdf ", out_dir,"figure/"))
}
library(diprom)
library(jyunr)
## library(coda)
## library(dplyr)
## library(ggplot2)
## library(stringr)
## library(magrittr)
## library(bayesplot)
## library(foreach)
## library(doParallel)
stopImplicitCluster()
## doParallel::registerDoParallel(2)
registerDoParallel(cores = detectCores() - 1)
This section contains scripts to create an mcmc object: 1) read the MCMC samples, 2) set their column names, and 3) perform Procrustean matching.
set_cnames = function(M = M_, N = N_, dq = dq_, dn = dn_, dc = dc_) {
cnames <- c(".chain", ".iteration")
## CSVFormat prints by row-major order!
for (m in 1:M) {
cnames <- c(cnames, paste0("kappa.", m))
}
for (m in 1:N) {
cnames <- c(cnames, paste0("tau.", m))
}
for (k in 1:N) {
cnames <- c(cnames, paste0("theta.", k))
}
cnames <- c(cnames, "sigma", "beta", "mu_beta", "sigma_beta", "llike_", "lpri_", "lp_")
return(cnames)
}
get_nchain = function(out_dir) {
## use all sample files if nchain is not set
## assign if NULL
if (is.null(num_chain)) {
num_chain <- length(list.files(path = out_dir, pattern="sample_chain[0-9]+\\.csv"))
}
return(num_chain)
}
read_output = function(num_chain, cnames) {
dlist <- list()
for (cid in 1:num_chain) {
dlist[[cid]] <- jyunr::read_csv(paste0(out_dir, "sample_chain", cid, ".csv"), col_names = F, skip = 0) %>% as.data.frame()
colnames(dlist[[cid]]) <- cnames
}
return(dlist)
}
mvar <- jyunr::read_csv(paste0(out_dir, "input/mvar.csv"), col_names = F) %>% as.matrix()
M = M_ = mvar[1]
N = N_ = mvar[2]
dq = dq_ = mvar[3]
dn = dn_ = mvar[4]
dc = dc_ = mvar[5]
cnames = diprom::set_cnames(M, N, dq, dn, dc)
dlist = diprom::read_output(num_chain, cnames)
## unlist
## df <- bind_rows(dlist, .id = "column_label")
mclist <- mcmc.list()
for (cid in 1:num_chain) {
mclist[[cid]] <- mcmc(dlist[[cid]])
}
figure/theta_res.pdf figure/theta_tau_res.pdf figure/tau_res.pdf figure/time_action.pdf figure/time_action_more.pdf
source("R/pmean.R")
<<PUF_filter>>
## dat <- jyunr::read_csv(paste0(out_dir, "input/dat.csv"), col_names = FALSE)
## rr <- dat[, c(1, 8)] %>% as.data.frame()
## rr <- rr[!duplicated(rr), 2]
gg <- item %>%
group_by(SEQID) %>%
summarize(ftime = timestamp[1] / 1000, naction = n(),
time = timestamp[n()] / 1000, spd = naction / time)
## binfo = jyunr::read_csv("./data/PIAAC_cleaned_data_1110/PUFs_noN.csv")
## book_order: =minfo= has fewer SEQIDs than =item=
gres <- item %>%
group_by(SEQID) %>%
summarize(res = response[n()], time = timestamp[n()] / 1000) %>% mutate(index = 1:n())
ginfo = plyr::join(minfo, gres)
rr = ginfo$res
WIP: stopped here
pid = unique(item$SEQID)
pdf("figure/tau_action.pdf")
pp <- cl_plot(cbind(log(mtau), log(gg$naction)), rr, c("log(tau)", "log(#action)"))
print(pp)
dev.off(which = dev.cur())
pdf("figure/theta_res.pdf")
dd <- data.frame(theta = mtheta, res = rr)
boxp <- ggplot(dd, aes(x = factor(res), y = theta, fill = factor(res))) +
geom_boxplot() +
theme(legend.position = "none")
print(boxp)
dev.off(which = dev.cur())
pdf("figure/tau_res.pdf")
dd <- data.frame(tau = mtau, res = rr)
boxp <- ggplot(dd, aes(x = factor(res), y = tau, fill = factor(res))) +
geom_boxplot() +
theme(legend.position = "none")
print(boxp)
dev.off(which = dev.cur())
pdf("figure/theta_tau_res.pdf")
pp <- cl_plot(cbind(mtheta, log(mtau)), rr, c("theta", "log(tau)"))
print(pp)
dev.off(which = dev.cur())
pdf("figure/time_action_more.pdf")
cl_box(log(gg$time), rr, c("log(time)", "res"))
cl_box(log(gg$ftime / gg$time), rr, c("log(ftime / time)", "res"))
cl_box(log(gg$time - gg$ftime), rr, c("log(time - ftime)", "res"))
cl_box(log(gg$naction / gg$time), rr, c("log(#action / time)", "res"))
pp <- cl_plot(cbind(log(gg$time), log(gg$naction)), rr, c("log(time)", "log(#actions)"))
print(pp)
pp <- cl_plot(cbind(log(gg$time), log(gg$ftime)), rr, c("log(time)", "log(ftime)"))
print(pp)
pp <- cl_plot(cbind(mtheta, log(gg$time)), rr, c("theta", "log(time)"))
print(pp)
pp <- cl_plot(cbind(mtheta, log(gg$time - gg$ftime)), rr, c("theta", "log(time - ftime)"))
print(pp)
pp <- cl_plot(cbind(mtheta, log(gg$naction / gg$time)), rr, c("theta", "log(#action / time)"))
print(pp)
pp <- cl_plot(cbind(mtheta, log(gg$naction / (gg$time - gg$ftime))), rr, c("theta", "log(#action / (time - ftime))"))
print(pp)
pp <- cl_plot(cbind(mtau, log(gg$time)), rr, c("tau", "log(time)"))
print(pp)
pp <- cl_plot(cbind(mtau, log(gg$time - gg$ftime)), rr, c("tau", "log(time - ftime)"))
print(pp)
pp <- cl_plot(cbind(mtau, log(gg$naction / gg$time)), rr, c("tau", "log(#action / time)"))
print(pp)
pp <- cl_plot(cbind(mtau, log(gg$naction / (gg$time - gg$ftime))), rr, c("tau", "log(#action / (time - ftime))"))
print(pp)
dev.off(which = dev.cur())
- first action time and number of actions
pdf("figure/time_action.pdf")
cl_box(log(gg$ftime), rr, c("log(first_action_time)", "res"))
cl_box(log(gg$naction), rr, c("log(naction)", "res"))
pp <- cl_plot(cbind(mtheta, log(gg$ftime)), rr, c("theta", "log(first_action_time)"))
print(pp)
pp <- cl_plot(cbind(mtheta, log(gg$naction)), rr, c("theta", "log(#actions)"))
print(pp)
pp <- cl_plot(cbind(mtau, log(gg$ftime)), rr, c("tau", "log(first_action_time)"))
print(pp)
pp <- cl_plot(cbind(mtau, log(gg$naction)), rr, c("tau", "log(#actions)"))
print(pp)
pp <- cl_plot(cbind(log(gg$ftime), log(gg$naction)), rr, c("log(first_action_time)", "log(#action)"))
print(pp)
dev.off(which = dev.cur())
## sink("output/mcmc_summary.txt")
cat("==================================")
cat("Rejection Rate")
cat("==================================")
rejectionRate(mclist)
cat("==================================")
cat("Effective Size")
cat("==================================")
effectiveSize(mclist)
cat("==================================")
cat("Summary")
cat("==================================")
summary(mclist)
## sink()
library(diprom)
ldir = c(
"party_invitations-1/",
"party_invitations-2/",
"cd_tally/",
"sprained_ankle-1/",
## "sprained_ankle-2/", ## didn't run
"tickets/",
## "class_attendance/", ## didn't run
"club_membership-1/",
## "club_membership-2/", ## didn't run
"book_order/",
"meeting_room/",
## "reply_all/", ## failed
"locate_email/",
"lamp_return/")
for (out_dir in ldir) {
fwil_path = paste0(out_dir, "wilcoxon.txt")
if (file.exists(fwil_path)) {
##Delete file if it exists
file.remove(fwil_path)
}
source("R/pmean.R")
source("R/pinfo_preproc.R")
sink(fwil_path)
x = minfo$tau[minfo$GENDER_R == 1]
y = minfo$tau[minfo$GENDER_R == 2]
cat(wilcox.test(x, y, alternative = "two.sided")$p.value)
cat("\n")
x = minfo$theta[minfo$GENDER_R == 1]
y = minfo$theta[minfo$GENDER_R == 2]
cat(wilcox.test(x, y, alternative = "two.sided")$p.value)
sink()
}
for (out_dir in ldir) {
cat(out_dir,"\n")
pp = read.table(paste0(out_dir, "wilcoxon.txt"))
cat(unlist(pp),"\n")
cat("#############\n")
}
## out_dir="party_invitations-1/"
## out_dir="party_invitations-2/"
## out_dir="cd_tally/"
## out_dir="sprained_ankle-1/"
## out_dir="sprained_ankle-2/"
## out_dir="tickets/"
## out_dir="class_attendance/"
## out_dir="club_membership-1/"
## out_dir="club_membership-2/"
## out_dir="book_order/"
## out_dir="meeting_room/"
## out_dir="reply_all/"
## out_dir="locate_email/"
## out_dir="lamp_return/"
num_chain = 2
stopImplicitCluster()
## doParallel::registerDoParallel(2)
registerDoParallel(cores = detectCores() - 1)
for (out_dir in ldir) {
lname = list(paste0(out_dir,"tau_imp.txt"), paste0(out_dir,"theta_imp.txt"))
source("R/pmean.R")
source("R/pinfo_preproc.R")
res = list(tau = minfo$tau, theta = minfo$theta)
for (kk in 1:2) {
fname = lname[[kk]]
if (file.exists(fname)) {
##Delete file if it exists
file.remove(fname)
}
fcon = file(fname)
y = res[[kk]]
## source("R/tune_ranger.R")
source("R/train_ranger.R")
source("R/train_pimp_rf.R")
## source("R/retrain_pimp_rf.R")
fstr = print(imp[,-3])
writeLines(knitr::kable(fstr), fcon)
close(fcon)
}
}
<<lpa_prep>>
for (out_dir in ldir) {
<<lpa_compare>>
}
library(diprom)
library(jyunr)
library(tidyLPA)
ldir = c(
"party_invitations-1/",
"party_invitations-2/",
"cd_tally/",
"sprained_ankle-1/",
## "sprained_ankle-2/", ## didn't run
"tickets/",
## "class_attendance/", ## didn't run
"club_membership-1/",
## "club_membership-2/", ## didn't run
"book_order/",
"meeting_room/",
## "reply_all/", ## failed
"locate_email/",
"lamp_return/"
)
## out_dir="party_invitations-1/"
## out_dir="party_invitations-2/"
## out_dir="cd_tally/"
## out_dir="sprained_ankle-1/"
## out_dir="sprained_ankle-2/"
## out_dir="tickets/"
## out_dir="class_attendance/"
## out_dir="club_membership-1/"
## out_dir="club_membership-2/"
## out_dir="book_order/"
## out_dir="meeting_room/"
## out_dir="reply_all/"
## out_dir="locate_email/"
## out_dir="lamp_return/"
num_chain = 2
stopImplicitCluster()
## doParallel::registerDoParallel(2)
registerDoParallel(cores = detectCores() - 1)
flpa_path = paste0(out_dir, "lpa_mods.txt")
if (file.exists(flpa_path)) {
##Delete file if it exists
file.remove(flpa_path)
}
<<get_traits>>
ainfo = plyr::join(minfo, gg)
ainfo = ainfo %>% mutate(ltau = log(tau), laction = log(naction))
mod1 = ainfo %>%
select(tau, theta) %>%
single_imputation() %>%
scale() %>%
estimate_profiles(1:4,
variances = c( "varying"),
covariances = c( "varying")) %>%
compare_solutions(statistics = c("AIC","AWE", "BIC", "CLC", "KIC"))
mod2 = ainfo %>%
select(tau, theta, naction, spd) %>%
single_imputation() %>%
scale() %>%
estimate_profiles(1:4,
variances = c( "varying"),
covariances = c( "varying")) %>%
compare_solutions(statistics = c("AIC","AWE", "BIC", "CLC", "KIC"))
mod3 = ainfo %>%
select(naction, spd) %>%
single_imputation() %>%
scale() %>%
estimate_profiles(1:4,
variances = c( "varying"),
covariances = c( "varying")) %>%
compare_solutions(statistics = c("AIC","AWE", "BIC", "CLC", "KIC"))
sink(flpa_path)
cat("\n--------------------------------------------------------------\n")
cat("tau and theta")
cat("\n--------------------------------------------------------------\n")
print(mod1)
cat("\n--------------------------------------------------------------\n")
cat("tau and theta + naction, spd")
cat("\n--------------------------------------------------------------\n")
print(mod2)
cat("\n--------------------------------------------------------------\n")
cat("naction, spd")
cat("\n--------------------------------------------------------------\n")
print(mod3)
sink()
## mod1 = ainfo %>%
## select(ltau, theta) %>%
## single_imputation() %>%
## estimate_profiles(3, variances = "varying", covariances = "varying")
## mod2 = ainfo %>%
## select(ltau, theta, laction, spd) %>%
## single_imputation() %>%
## estimate_profiles(2, variances = c("varying"), covariances = c("zero"))
## mod3 = ainfo %>%
## select(laction, spd) %>%
## single_imputation() %>%
## estimate_profiles(2, variances = "varying", covariances = "varying")
pdf("figure/param_age.pdf")
dat <- jyunr::read_csv(paste0(out_dir, "input/dat.csv"), col_names = FALSE)
rr <- dat[, c(1, 8)] %>% as.data.frame()
rr <- rr[!duplicated(rr), 2]
gg = cl_plot(cbind(minfo$AGEG5LFS, minfo$tau), rr, c("age","tau"), xlim=c(0,max(minfo$AGEG5LFS)), ylim = c(0, max( minfo$tau) ))
print(gg)
gg = cl_plot(cbind(minfo$AGEG5LFS, minfo$theta), rr, c("age","theta"), xlim=c(0,max(minfo$AGEG5LFS)), ylim = c(0, max( minfo$theta) ))
print(gg)
dev.off()
var = jyunr::read_csv("./data/PIAAC_cleaned_data_1110/PUF_Variables.csv")
var = var %>% filter(Domain %in% c("Sampling / weighting", "Not assigned" ,"Sampling / weighting (derived)", "Background questionnaire (trend)" ,"Background questionnaire", "Background questionnaire (derived)"))
param = data.frame(SEQID = unique(item$SEQID), theta = mtheta$mean, tau = mtau$mean)
pinfo = jyunr::read_csv("./data/PIAAC_cleaned_data_1110/PUFs_spss.csv")
pinfo = pinfo %>% select(SEQID) %>% cbind(pinfo[,names(pinfo) %in% toupper(var$Name)])
minfo = plyr::join(param, pinfo, by = 'SEQID', type = "inner")
dx = minfo[,4:ncol(minfo)]
xtrain <- model.matrix(~ 0 + ., data = dx)
ytrain <- y
weights <- rep(1, length(y)) / length(y)
set.seed(1)
nfold <- 4
shu <- sample(1:length(ytrain))
xtrain <- xtrain[shu, ]
ytrain <- ytrain[shu]
data_folds <- caret::createFolds(ytrain, k = nfold)
The R package ranger is used to train the random forest. The R package caret wraps many modeling packages and is what we use for training. Below are caret's model-training parameters.
my_tr <- trainControl(
method = "cv",
number = nfold,
## classProbs = TRUE,
savePredictions = "all",
## summaryFunction = twoClassSummary, # AUC
## ,summaryFunction = prSummary # PR-AUC
## ,summaryFunction = fSummary # F1
## summaryFunction = mnLogLoss,
search = "random",
verboseIter = TRUE,
allowParallel = TRUE,
indexOut = data_folds
)
Unlike CatBoost or XGBoost, ranger has no internal mechanism for handling missing values.
## imputation needs for ranger
## ixtrain <- xtrain
## ixtrain[is.na(ixtrain)] <- -99
## above50k needs to be "positive"
## caret considers 1st class as "positive" class
## fytrain <- factor(ytrain)
## levels(fytrain) <- c("no_cor", "cor")
- Tuned hyperparameters of ranger.
ranger_grid <- expand.grid(
mtry = c(5,10,15,20,25,30),
splitrule = "variance",
min.node.size = c(5,10,15,20)
)
The below runs CV and saves a final model.
set.seed(1)
ranger_tune <- train(
x = xtrain, y = ytrain,
method = "ranger",
trControl = my_tr,
tuneGrid = ranger_grid,
preProc = NULL,
importance = "impurity",
num.trees = 500)
temp <- ranger_tune$pred$co
ranger_id <- ranger_tune$pred$rowIndex
ranger_prob <- temp[order(ranger_id)]
ranger_final <- ranger_tune$finalModel
ranger_imp <- varImp(ranger_tune)$importance
ranger gives the confusion matrix if probability is set to FALSE.
set.seed(9999)
mydd = data.frame(y=ytrain, xtrain)
## to calculate importance p-values based on permutation
ranger_ff <- ranger(y~.,data = mydd,
num.trees = 500, mtry = 30,
## importance = "impurity_corrected",
importance = "permutation",
## importance = "hold-out",
## importance = "impurity",
write.forest = TRUE,
## probability = TRUE,
min.node.size = 5,
class.weights = NULL, splitrule = "variance", classification = FALSE,
seed = 99
)
## ranger_ho <- holdoutRF(y~.,data = mydd,
## num.trees = 500, mtry = 30,
## ## importance = "impurity_corrected",
## ## importance = "permutation",
## ## importance = "hold-out",
## ## importance = "impurity",
## write.forest = TRUE,
## ## probability = TRUE,
## min.node.size = 5,
## class.weights = NULL, splitrule = "variance", classification = FALSE,
## seed = 99
## )
ranger_imp <- ranger_ff$variable.importance
rtemp <- data.frame(ranger_imp = ranger_imp)
row.names(rtemp) <- names(ranger_imp)
rrtemp <- rtemp %>%
arrange(-ranger_imp) # %>% slice(1:15)
rrtemp[,1] = rrtemp[,1] / rrtemp[1,1] * 100
vimp = g10 = rrtemp %>% filter(ranger_imp > 25)
vimp = data.frame(Name = row.names(vimp), imp = vimp)
tmp = var %>% filter(Name %in% vimp$Name) %>% select(Name,Label, 'Value scheme')
vimp = plyr::join(tmp, vimp)
vimp[,-3]
g10 = vimp %>% filter(!(Name %in% c("AGEG10LFS","AGEG10LFS_T"))) %>% select(Name, ranger_imp)
set.seed(9999)
mydd = mydd %>% select(y, g10$Name)
ranger_ff <- ranger(y~.,data = mydd,
num.trees = 500, mtry = min(10,nrow(g10)),
## importance = "impurity_corrected",
importance = "permutation",
## importance = "hold-out",
## importance = "impurity",
write.forest = TRUE,
## probability = TRUE,
min.node.size = 5,
class.weights = NULL, splitrule = "variance", classification = FALSE,
seed = 99
)
im_pp = importance_pvalues(ranger_ff, method = "altmann", formula = y ~ ., data = mydd,
num.permutations = 100)
imp = data.frame(Name = row.names(im_pp), imp = im_pp[,1], pval = im_pp[,2])
tmp = var %>% filter(Name %in% imp$Name) %>% select(Name, Label, 'Value scheme')
imp = plyr::join(tmp, imp) %>% arrange(pval)
imp[,-3]
pimp = sort(im_pp[,2])
pimp[pimp < 0.05] %>% knitr::kable()
pimp_vv = names(pimp)[pimp < 0.05]
|               |        x|
|:--------------|--------:|
|AGEG5LFS       | 0.009901|
|ICTHOME        | 0.009901|
|J_Q08          | 0.019802|
|ICTHOME_WLE_CA | 0.019802|
|EDCAT7         | 0.039604|
|WRITHOME       | 0.049505|
mydd = data.frame(y=ytrain, xtrain[, pimp_vv])
set.seed(1)
## pimp_rf_tune <- train(
## y = ytrain, x = xtrain[, pimp_vv],
## method = "ranger",
## trControl = my_tr,
## tuneGrid = expand.grid(
## mtry = c(2,4),
## splitrule = "variance",
## min.node.size = c(5,10,15,20)),
## ## weights = weights,
## preProc = NULL,
## importance = "impurity",
## num.trees = 500)
## to calculate importance p-values based on permutation
pimp_rf <- ranger(y~.,data = mydd,
num.trees = 500, mtry = min(4,ncol(mydd)-1),
## importance = "impurity_corrected",
importance = "permutation",
## importance = "hold-out",
## importance = "impurity",
write.forest = TRUE,
## probability = TRUE,
min.node.size = 5,
class.weights = NULL, splitrule = "variance", classification = FALSE,
seed = 99
)
vimp = pimp_rf$variable.importance
vimp = data.frame(Name = names(vimp), imp = vimp)
tmp = var %>% filter(Name %in% names(pimp_rf$variable.importance)) %>% select(Name,Label, 'Value scheme') %>% mutate(imp = pimp_rf$variable.importance)
vimp = plyr::join(tmp, vimp)
vimp[,-3]
Joining by: Name, imp
  Name            Label                                                                  imp
1 J_Q08           Background - Number of books at home                                   0.09185096
2 AGEG5LFS        Age groups in 5-year intervals based on LFS groupings (derived)        0.15897985
3 EDCAT7          Highest level of formal education obtained (7 categories - derived)    0.03301487
4 ICTHOME         Index of use of ICT skills at home (derived)                           0.07380785
5 ICTHOME_WLE_CA  Index of use of ICT skills at home, categorised WLE (derived)          0.04287672
6 WRITHOME        Index of use of writing skills at home (derived)                       0.03834553
saveRDS(pimp_rf, "cor_ranger_pimp_rf.rds")
saveRDS(ranger_ff, "cor_ranger_rural_ff.rds")
The below saves the urban prediction model. The predicted probability will be used for the cor prediction.
saveRDS(ranger_final, "cor_ranger_pred_rural.rds")
caret_wlogloss(ranger_tune$pred)
mean(ranger_tune$resample[, 1]) # incorrect. not using weight
The code below performs LPA using the optimal number of clusters (determined from the prior procedure based on AIC, BIC, etc.).
<<lpa_prep>>
for (out_dir in ldir) {
cathead(out_dir)
<<draw_lpa_fig>>
mod = mod2
mod$name = "2"
<<draw_lpa_back>>
<<draw_lpa_back_vio>>
mod = mod3
mod$name = "3"
<<draw_lpa_back>>
<<draw_lpa_back_vio>>
}
## nc2 = c(4,4,3)
## nc3 = c(2,3,2)
## cd_tally, book_order are winning for AIC and BIC
## "club_membership-1/": 3 3
nc2 = c(4,4,4,4,4,3,4,4,4,4)
nc3 = c(4,4,3,4,4,3,3,3,4,3)
## nc2 = c(4,4,4)
## nc3 = c(4,4,3)
kk = which(out_dir == ldir)
lname = list(paste0(out_dir,"tau_imp.txt"), paste0(out_dir,"theta_imp.txt"))
<<get_traits>>
res = list(tau = minfo$tau, theta = minfo$theta)
ainfo = plyr::join(minfo, gg)
ainfo = ainfo %>% mutate(ltau = log(tau), laction = log(naction))
mod2 = ainfo %>%
select(tau, theta, naction, spd) %>%
single_imputation() %>%
scale() %>%
estimate_profiles(nc2[kk],
variances = c( "varying"),
covariances = c( "varying"))
mod3 = ainfo %>%
select(naction, spd) %>%
single_imputation() %>%
scale() %>%
estimate_profiles(nc3[kk],
variances = c( "varying"),
covariances = c( "varying"))
## pdf(paste0(out_dir,"figure/lpa_plot.pdf"))
## mod2 %>% plot_profiles()
## mod3 %>% plot_profiles()
## dev.off()
## source("R/lpa_back.R")
## library(gridExtra)
<<background_vname>>
<<background_df>>
prob = numeric(nrow(tt))
res = scale(ginfo$res)
time = scale(ginfo$time)
for (kk in 1:length(prob))
prob[kk] = tt[kk,paste0('CPROB',tt$Class[kk])]
tmp = tt %>% mutate(cluster = Class, prob = prob, time = time) %>% select(c(3:6),ftime, time, prob, cluster)
df_long = clust2long(tmp, `tau` = "tau", `theta` = "theta", `naction` = "N[actions]", `spd` = "speed", `ftime` = "ftime", `time` = "time")
b1 = box_clust(df_long, 1, "bottom", grid = "Y",
axis_title_size = 15, axis_text = "y",
plot_margin = margin(t = 10, r = 0, b = 0, l = 0),
axis_title_just = "Fm", panel_border = T) +
scale_x_continuous(breaks = tmp$cluster) +
scale_y_continuous(limits = c(-4, 4),
##labels = c(rep("",1),-2,0,2,rep("",1)),
expand = c(0, 0)) +
ylab("Standardized value") +
xlab("") +
scale_color_tab20c(1) + scale_fill_tab20c(4)
## fsize = set_fig_size(width = 'psychometrikaV2')
## ggsave(file=paste0(out_dir,"figure/lpa_box4.pdf"),width=fsize[1],height=fsize[2])
tmp = tt %>% mutate(cluster = Class, prob = prob, res = res) %>% select(res, c(7:11), prob, cluster)
df_long = clust2long(tmp, `res` = "response", `AGEG5LFS` = "Age", `ICTHOME` = "ICT", `READHOME` = "Read", `WRITHOME` = "Write", `NUMHOME` = "Numeracy")
b2 = box_clust(df_long, 1, "bottom", grid = "Y",
axis_title_size = 15, axis_text = "y",
plot_margin = margin(t = 10, r = 0, b = 0, l = 0),
axis_title_just = "Fm", panel_border = T) +
scale_x_continuous(breaks = tmp$cluster) +
scale_y_continuous(limits = c(-4, 4),
##labels = c(rep("",1),-2,0,2,rep("",1)),
expand = c(0, 0)) +
ylab("Standardized value") +
xlab("") +
scale_color_tab20c(1) + scale_fill_tab20c(4)
## ggsave(file="bb.pdf",width=7,height=7,family="sans")
## https://disq.us/p/2940ij3
## Golden ratio to set aesthetic figure height
fsize = set_fig_size(width = 'psychometrikaV2')
## ggsave(file=paste0(out_dir,"figure/lpa_box_back.pdf"),width=fsize[1],height=fsize[2])
pdf(file=paste0(out_dir,"figure/lpa_box_all_",mod$name,".pdf"),width=fsize[1],height=fsize[2])
gridExtra::grid.arrange(b1,b2,nrow=2)
dev.off()
## source("R/lpa_back.R")
## library(gridExtra)
<<background_vname>>
<<background_df>>
prob = numeric(nrow(tt))
res = scale(ginfo$res)
time = scale(ginfo$time)
for (kk in 1:length(prob))
prob[kk] = tt[kk,paste0('CPROB',tt$Class[kk])]
tmp = tt %>% mutate(cluster = Class, prob = prob, time = time) %>% select(c(3:6),ftime, time, prob, cluster)
df_long = clust2long(tmp, `tau` = "tau", `theta` = "theta", `naction` = "N[actions]", `spd` = "speed", `ftime` = "ftime", `time` = "time")
v1 = vio_clust(df_long, 1, "bottom", grid = "Y",
axis_title_size = 15, axis_text = "y",
plot_margin = margin(t = 10, r = 0, b = 0, l = 0),
axis_title_just = "Fm", panel_border = T) +
scale_x_continuous(breaks = tmp$cluster) +
scale_y_continuous(limits = c(-4, 4),
##labels = c(rep("",1),-2,0,2,rep("",1)),
expand = c(0, 0)) +
ylab("Standardized value") +
xlab("") +
scale_color_tab20c(1) + scale_fill_tab20c(4)
## fsize = set_fig_size(width = 'psychometrikaV2')
## ggsave(file=paste0(out_dir,"figure/lpa_box4.pdf"),width=fsize[1],height=fsize[2])
tmp = tt %>% mutate(cluster = Class, prob = prob, res = res) %>% select(res, c(7:11), prob, cluster)
df_long = clust2long(tmp, `res` = "response", `AGEG5LFS` = "Age", `ICTHOME` = "ICT", `READHOME` = "Read", `WRITHOME` = "Write", `NUMHOME` = "Numeracy")
v2 = vio_clust(df_long, 1, "bottom", grid = "Y",
axis_title_size = 15, axis_text = "y",
plot_margin = margin(t = 10, r = 0, b = 0, l = 0),
axis_title_just = "Fm", panel_border = T) +
scale_x_continuous(breaks = tmp$cluster) +
scale_y_continuous(limits = c(-4, 4),
##labels = c(rep("",1),-2,0,2,rep("",1)),
expand = c(0, 0)) +
ylab("Standardized value") +
xlab("") +
scale_color_tab20c(1) + scale_fill_tab20c(4)
## ggsave(file="bb.pdf",width=7,height=7,family="sans")
## https://disq.us/p/2940ij3
## Golden ratio to set aesthetic figure height
fsize = set_fig_size(width = 'psychometrikaV2')
## ggsave(file=paste0(out_dir,"figure/lpa_box_back.pdf"),width=fsize[1],height=fsize[2])
pdf(file=paste0(out_dir,"figure/lpa_vio_all_",mod$name,".pdf"),width=fsize[1],height=fsize[2])
gridExtra::grid.arrange(v1,v2,nrow=2)
dev.off()
tmp = tt %>% mutate(cluster = Class, prob = prob) %>% select(c(3:6), prob, cluster)
##plot2d(tmp[,1:2], tmp$cluster) +
##
bb = tmp %>% select(tau, theta, cluster)
gg <- ggplot(bb, aes(x = theta, y = tau,
colour = factor(cluster), shape = factor(cluster))) +
geom_point(size = 1.5, alpha = 0.7) +
theme_base(grid = "XY")+
theme_legend() +
labs(colour = "Cluster", shape = "Cluster") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
xlab(expression(theta))
bb
## ggsave(file="bb.pdf",width=7,height=7,family="sans")
## https://disq.us/p/2940ij3
## Golden ratio to set aesthetic figure height
## fsize = set_fig_size(width = 'psychometrikaV2')
## ggsave(file=paste0(out_dir,"figure/lpa_box_back.pdf"),width=fsize[1],height=fsize[2])
mod = mod2
mod$name = "2"
bf = ainfo %>% select(AGEG5LFS, ICTHOME, READHOME, WRITHOME, NUMHOME, NFEHRS, ICTWORK, READWORK, WRITWORK, NUMWORK, INFLUENCE, EARNHRDCL, TASKDISC, LEARNATWORK)
bname = names(bf)
<<PUF_filter>>
fname = "back_var.txt"
toprint = var %>% filter(Name %in% bname) %>% select(Name, Label, 'Value scheme')
kable2file(toprint, fname)
bf[bf==999999999999] = NA ## back to NA
bf = bf %>%
single_imputation() %>%
scale() %>% as.data.frame
ftime = ainfo$ftime %>% scale()
df = mod[[1]]
before = df[["dff"]][,1:6]
after = df[["dff"]][,-c(1:6)]
df[["dff"]] = before %>% mutate(AGEG5LFS = bf$AGEG5LFS, ICTHOME = bf$ICTHOME, READHOME = bf$READHOME, WRITHOME = bf$WRITHOME, NUMHOME = bf$NUMHOME, NFEHRS = bf$NFEHRS, ICTWORK = bf$ICTWORK, READWORK = bf$READWORK, WRITWORK = bf$WRITWORK, NUMWORK = bf$NUMWORK, INFLUENCE = bf$INFLUENCE, EARNHRDCL = bf$EARNHRDCL, TASKDISC = bf$TASKDISC, LEARNATWORK = bf$LEARNATWORK) %>% cbind(after) %>% mutate(ftime)
variables = c("tau", "theta", "naction", "spd", bname)
tt = df[["dff"]]
mm = aggregate(tt, list(tt$Class), FUN=mean)
sd = aggregate(tt, list(tt$Class), FUN=sd)
n = aggregate(tt, list(tt$Class), FUN=length)
se = sd / sqrt(n)
df_raw = diprom:::.get_long_data(df)
x = mod2; ci = .95; sd = TRUE; add_line = FALSE; rawdata = TRUE; bw = FALSE; alpha_range = c(0, .1)
df_plot <- get_estimates(x)
names(df_plot)[match(c("Estimate", "Parameter"), names(df_plot))] <- c("Value", "Variable")
df_plot$Class <- ordered(df_plot$Class)
if(!"Classes" %in% names(df_plot)){
df_plot$Classes <- length(unique(df_plot$Class))
}
# Drop useless stuff
df_plot <- df_plot[grepl("(^Means$|^Variances$)", df_plot$Category),
-match(c("p"), names(df_plot))]
df_plot$Variable <- ordered(df_plot$Variable, levels = unique(df_plot$Variable))
# Select only requested variables, or else, all variables
if (!is.null(variables)) {
df_plot <- df_plot[tolower(df_plot$Variable) %in% tolower(variables), ]
}
df_plot$Variable <- droplevels(df_plot$Variable)
## variables <- levels(df_plot$Variable)
df_plot$idvar <- paste0(df_plot$Model, df_plot$Classes, df_plot$Class, df_plot$Variable)
df_plot <- reshape(data.frame(df_plot), idvar = "idvar", timevar = "Category", v.names = c("Value", "se"), direction = "wide")
df_plot <- df_plot[, -match("idvar", names(df_plot))]
# Get some classy names
names(df_plot) <- gsub("\\.Means", "", names(df_plot))
dd = df_plot
for (nn in bname) {
for (ii in unique(df_plot$Class)) {
dd = rbind(dd,data.frame(Variable = nn, Class = ii, Model = dd[1,3], Classes = dd[1,4], Value = mm[ii,nn], se = se[ii,nn], Value.Variances = dd[1,7],
se.Variances = dd[1,8]))
}}
df_plot = dd
variables <- levels(df_plot$Variable)
df_raw <- df_raw[, c("model_number", "classes_number", variables, "Class", "Class_prob", "Probability", "id")]
df_raw$Class <- ordered(df_raw$Class_prob, levels = levels(df_plot$Class))
variable_names <- paste("Value", names(df_raw)[-c(1,2, ncol(df_raw)-c(0:3))], sep = "...")
names(df_raw)[-c(1,2, ncol(df_raw)-c(0:3))] <- variable_names
df_raw <- reshape(
df_raw,
varying = c(Variable = variable_names),
idvar = "new_id",
direction = "long",
timevar = "Variable",
sep = "..."
)
if(any(c("Class_prob", "id", "new_id") %in% names(df_raw))){
df_raw <- df_raw[, -which(names(df_raw) %in% c("Class_prob", "id", "new_id"))]
}
df_raw$Variable <- ordered(df_raw$Variable,
levels = levels(df_plot$Variable))
names(df_raw)[c(1,2)] <- c("Model", "Classes")
classplot = plot_profiles.default(x = list(df_plot = df_plot, df_raw = df_raw))
<<lpa_prep>>
for (out_dir in ldir) {
cathead(out_dir)
<<draw_lpa_fig>>
mod = mod2
mod$name = "2"
<<background_vname>>
<<background_df>>
<<lpa_membership_res>>
mod = mod3
mod$name = "3"
<<background_vname>>
<<background_df>>
<<lpa_membership_res>>
}
memb <- data.frame(tt,SEQID = ginfo$SEQID, index = ginfo$index, res = ginfo$res)
save(memb, file = paste0(out_dir, "lpa_membership_mod",mod$name,".RData"))
memb_tt = tt %>% mutate(res = ginfo$res)
mm = aggregate(memb_tt, list(memb_tt$Class), FUN=mean)
sd = aggregate(memb_tt, list(memb_tt$Class), FUN=sd)
n = aggregate(memb_tt, list(memb_tt$Class), FUN=length)
se = sd / sqrt(n)
n = n[,4]
mm = mm %>% select(4:7,res)
sd = sd %>% select(4:7,res)
mm$n = n;
sd$n = 0*n;
save(mm,sd,n, file = paste0(out_dir, "summaryw_res_mod",mod$name,".RData"))
mat2mean_sd(mm,sd,denote_sd = "paren", markup = "markdown") %>%
knitr::kable(align = "r")
plot_intervals = function(param, trans_ = "log", save_plot = TRUE){
regexp = paste0("^",param,"\\.")
if (save_plot) pdf(paste0("figure/",param,"_mcmc_intervals.pdf"))
p0 <- bayesplot::mcmc_intervals(
mclist,
regex_pars = regexp,
transformations = trans_
)
## mcmc_areas(
## lambda0.sam,
## prob = 0.8, # 80% intervals
## prob_outer = 0.99, # 99%
## point_est = "mean"
## )
print(p0)
if (save_plot) dev.off(which = dev.cur())
}
plot_parcoord = function(param, trans_ = "log", save_plot = TRUE){
regexp = paste0("^",param,"\\.")
if (save_plot) pdf(paste0("figure/",param,"_mcmc_parcoord.pdf"))
p0 <- bayesplot::mcmc_parcoord(
mclist,
regex_pars = regexp,
transformations = trans_
)
## mcmc_areas(
## lambda0.sam,
## prob = 0.8, # 80% intervals
## prob_outer = 0.99, # 99%
## point_est = "mean"
## )
print(p0)
if (save_plot) dev.off(which = dev.cur())
}
plot_trace = function(param, max_, trans_ = "log", save_plot = TRUE){
if (save_plot) pdf(paste0("figure/",param,"_mcmc_trace.pdf"))
regexp = paste0("^",param,"\\.[0-9]$")
for (ii in 0:floor(max_/10)) {
if (ii > 0) regexp = paste0("^",param,"\\.", ii, "[0-9]$")
p <- bayesplot::mcmc_trace(
mclist,
regex_pars = regexp,
transformations = trans_,
facet_args = list(nrow = 4, labeller = label_parsed)
)
print(p <- p + bayesplot::facet_text(size = 10))
## mcmc_areas(
## lambda0.sam,
## prob = 0.8, # 80% intervals
## prob_outer = 0.99, # 99%
## point_est = "mean"
## )
}
if (save_plot) dev.off(which = dev.cur())
}
figure/theta_mcmc_intervals.pdf figure/tau_mcmc_intervals.pdf figure/theta_mcmc_intervals.pdf
diprom::plot_intervals("kappa")
diprom::plot_intervals("tau")
diprom::plot_intervals("theta", list())
figure/theta_mcmc_parcoord.pdf figure/tau_mcmc_parcoord.pdf figure/theta_mcmc_parcoord.pdf
diprom::plot_parcoord("kappa")
diprom::plot_parcoord("tau")
diprom::plot_parcoord("theta", list())
figure/kappa_mcmc_trace.pdf figure/tau_mcmc_trace.pdf figure/theta_mcmc_trace.pdf
bayesplot::color_scheme_set("mix-blue-pink")
diprom::plot_trace("kappa", M)
diprom::plot_trace("tau", N)
diprom::plot_trace("theta", 20, list())
pdf("figure/other_params_trace.pdf")
bayesplot::color_scheme_set("mix-blue-pink")
p <- bayesplot::mcmc_trace(mclist[[1]],
pars = c("sigma", "beta", "mu_beta", "sigma_beta", "llike_", "lpri_", "lp_"),
facet_args = list(nrow = 3, labeller = label_parsed)
)
print(p <- p + bayesplot::facet_text(size = 10))
dev.off(which = dev.cur())
word2vec
- [-] remove event_description with no information
- [-] remove consecutive action repetitions
- [X] concat event_type and event_description
- [X] substitute SPACE with "_"
- [X] concat actions into a sequence
- [X] remove START and END
- [X] write sequences to a txt file
source("email_word2vec_preproc.r")
- keep a small number of event_description values that are highly relevant to the movement of emails.
- remove actions taking < 50 ms (human perception is about 80 ms)
library(dplyr)
library(stringr)
setwd('~/workspace/procmod-code/')
load('./data/PIAAC_cleaned_data_1110/Problem_solving/Problem-solving_no_missing.rdata')
email = PS %>% filter(CODEBOOK == "U01a000S")
## core_event = c("MAIL_DRAG", "MAIL_MOVED")
core_event = c("MAIL_DRAG", "MAIL_MOVED", "MAIL_VIEWED")
## core_event = c("MAIL_MOVED")
email$event_description[!(email$event_type %in% core_event)] <- ""
## email$event_description <- ""
timestamp = email$timestamp
diff = c(diff(timestamp), 99999)
email$diff = diff
email = email[(diff > 99) | (diff < 0), ] ## vectorized OR; `||` would use only the first element
email$event_description = stringr::str_replace(email$event_description, "(.*)\\*\\$target=u01a_(.*)", "\\1\\2")
email$event_description = stringr::str_replace(email$event_description, "id=u01a_", "")
email$event_description = stringr::str_replace(email$event_description, "\\*\\$target=", "")
email = email %>% mutate(event_description = ifelse(event_type == "START","",event_description)) %>%
mutate(event_description = ifelse(event_type == "END","",event_description)) %>%
mutate(event_description = ifelse(event_type == "KEYPRESS","",event_description)) %>%
mutate(event_concat = ifelse(event_description == "", event_type, paste0(event_type,"-",event_description))) %>%
mutate(word = gsub(" ", "_", event_concat))
We need to remove transitions between the same action (self-transitions).
ww = email$word
id = email$SEQID
ww0 = ww[1:(length(ww)-1)]
ww1 = ww[2:(length(ww))]
dup = ww0 == ww1
dup0 = c(dup, FALSE )
dup1 = c( FALSE , dup)
idx = dup1
cw = "NULL"
cid = 9999999999
for (kk in which(dup1)) {
if(cw != ww[kk] && cid != id[kk]) idx[kk] = FALSE
cw = ww[kk]
cid = id[kk]
}
email = email[!idx, ]
n = nrow(email)
pre = email$word[1:(n-1)]
cur = email$word[2:n]
dif = c(TRUE, !(cur == pre))
idx = logical(n)
for (i in 2:n) {
if(dif[i] == FALSE && dif[i-1] == FALSE) {
idx[i] = TRUE
}
}
email = email[!idx,]
id = unique(email$SEQID)
seqs = character(length(id))
for (i in id) {
## for (word in email$word[email$SEQID == i]) {
## seqs[i] = paste0(seqs[i], " ", word)
## }
seqs[i] = paste(email$word[email$SEQID == i] , collapse = " ")
}
This leaves only one action in some cases, so let's keep START and END.
## for (i in id) {
## seqs[i] = gsub("START ", "", seqs[i])
## seqs[i] = gsub(" END", "", seqs[i])
## }
seqs = seqs[id]
data.table::fwrite(as.data.frame(seqs), "email_sentence.txt", col.names=F)
vv = numeric(max(id))
for (i in id) {
vv[i] =length(email$word[email$SEQID == i])
}
length(unique(email$word))
n = nrow(email)
trans = paste(email$word[1:(n-1)], "->", email$word[2:n])
trans = trans[trans != "END -> START"]
length(unique(trans))
90 * 89 / 2
ee = PS %>% filter(CODEBOOK == "U01a000S")
ww = paste(ee$event_type, ee$event_description)
length(unique(ww))
nn = nrow(ee)
trans = paste(ww[1:(nn-1)], "->", ww[2:nn])
length(unique(trans))
window_size = 2; it should be in [5, 20] for a small data set. num_ns = 4 (number of negative samples). See email_word2vec.ipynb. Download vectors.tsv and metadata.tsv to analyze the obtained embeddings in the Embedding Projector: https://projector.tensorflow.org/
source("email_word2vec_preproc.r")
vv = jyunr::read_tsv("vectors.tsv")
mm = jyunr::read_tsv("metadata.tsv")
ww = email$word
myvv = matrix(0, length(id), ncol(vv))
mymm=numeric(length(id))
idx =0
for (ii in id) {
idx=idx+1
ss=ww[email$SEQID==ii]
ss=ss[ss != "START"]
ss=ss[ss != "END "]
ss=ss[ss != "END_CANCEL"]
## ss=ss[ss != "MENUITEM_help"]
ee = numeric(length(ss))
for (kk in 1:length(ss)) {
ee[kk]= which(ss[kk] == mm)
}
myvv[idx,]=colMeans(vv[ee,])
mymm[idx]=email$response[email$SEQID==ii][1]
}
readr::write_tsv(as.data.frame(mymm),"mymm.tsv", col_names=F)
readr::write_tsv(as.data.frame(myvv),"myvv.tsv", col_names=F)
- I didn’t remove START and END.
- those with small event duration are deleted.
- centroid of action sequence is calculated and used for the clustering.
- timestamp information is not used for clustering
t-SNE can identify many subject groups. Wouldn't that be useful for further analysis? A hedged sketch is below.
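A minimal t-SNE sketch on the sequence centroids, assuming the Rtsne package is installed and using the myvv / mymm objects built above:
## hedged sketch: t-SNE on per-person centroid embeddings (Rtsne assumed)
library(Rtsne)
set.seed(1)
tt = Rtsne(myvv, perplexity = 30, check_duplicates = FALSE)
plot(tt$Y, col = factor(mymm), pch = 19, cex = 0.6,
     xlab = "t-SNE 1", ylab = "t-SNE 2")  # color points by response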
library(dplyr)
library(stringr)
load('./data/PIAAC_cleaned_data_1110/Problem_solving/Problem-solving_cleaned_1110.rdata')
df = PS %>% filter(CODEBOOK == "U21x000S")
ww = paste(df$event_type, df$event_description)
length(unique(ww))
nn = nrow(df)
trans = paste(ww[1:(nn-1)], "->", ww[2:nn])
length(unique(trans))
core_event = c("COMBOBOX", "TAB", "CHECKBOX")
## core_event = c("MAIL_MOVED")
df$event_description[!(df$event_type %in% core_event)] <- ""
timestamp = df$timestamp
diff = c(0, diff(timestamp))
df = df[diff > 50, ]
df$event_description = stringr::str_replace_all(df$event_description, "\\*\\$index=", "")
df$event_description = stringr::str_replace_all(df$event_description, "u021_(.*?)_","")
df$event_description = stringr::str_replace(df$event_description, "id=", "")
df$event_description = stringr::str_replace(df$event_description, "button", "")
df = df %>% mutate(event_description = ifelse(event_type == "START","",event_description)) %>%
mutate(event_description = ifelse(event_type == "END","",event_description)) %>%
mutate(event_description = ifelse(event_type == "KEYPRESS","",event_description)) %>%
mutate(event_concat = ifelse(event_description == "", event_type, paste0(event_type,"-",event_description))) %>%
mutate(word = gsub(" ", "_", event_concat))
n = nrow(df)
pre = df$word[1:(n-1)]
cur = df$word[2:n]
dif = c(TRUE, !(cur == pre))
idx = logical(n)
for (i in 2:n) {
if(dif[i] == FALSE && dif[i-1] == FALSE) {
idx[i] = TRUE
}
}
df = df[!idx,]
id = unique(df$SEQID)
seqs = character(length(id))
for (i in id) {
## for (word in df$word[df$SEQID == i]) {
## seqs[i] = paste0(seqs[i], " ", word)
## }
seqs[i] = paste(df$word[df$SEQID == i] , collapse = " ")
}
for (i in id) {
seqs[i] = gsub("START ", "", seqs[i])
seqs[i] = gsub(" END", "", seqs[i])
}
seqs = seqs[id]
data.table::fwrite(as.data.frame(seqs), "ticket_sentence.txt", col.names=F)
mm = 0
for (i in id) {
mm = max(mm , length(df$word[df$SEQID == i]))
}
mm
length(unique(df$word))
n = nrow(df)
trans = paste(df$word[1:(n-1)], "->", df$word[2:n])
trans = trans[trans != "END -> START"]
length(unique(trans))
- look at the spreadsheet and calculate the value
library(dplyr)
library(stringr)
load('./data/PIAAC_cleaned_data_1110/Problem_solving/Problem-solving_cleaned_1110.rdata')
df = PS %>% filter(CODEBOOK == "U03a000S")
ww = paste(df$event_type, df$event_description)
length(unique(ww))
nn = nrow(df)
trans = paste(ww[1:(nn-1)], "->", ww[2:nn])
length(unique(trans))
core_event = c("COMBOBOX", "TAB", "CHECKBOX")
## core_event = c("MAIL_MOVED")
df$event_description[!(df$event_type %in% core_event)] <- ""
timestamp = df$timestamp
diff = c(diff(timestamp), 9999)
df = df[diff > 99, ]
df$event_description = stringr::str_replace_all(df$event_description, "\\*\\$index=", "")
df$event_description = stringr::str_replace_all(df$event_description, "u021_(.*?)_","")
df$event_description = stringr::str_replace(df$event_description, "id=", "")
df$event_description = stringr::str_replace(df$event_description, "button", "")
df = df %>% mutate(event_description = ifelse(event_type == "START","",event_description)) %>%
mutate(event_description = ifelse(event_type == "END","",event_description)) %>%
mutate(event_description = ifelse(event_type == "KEYPRESS","",event_description)) %>%
mutate(event_concat = ifelse(event_description == "", event_type, paste0(event_type,"-",event_description))) %>%
mutate(word = gsub(" ", "_", event_concat))
n = nrow(df)
pre = df$word[1:(n-1)]
cur = df$word[2:n]
dif = c(TRUE, !(cur == pre))
idx = logical(n)
for (i in 2:n) {
if(dif[i] == FALSE && dif[i-1] == FALSE) {
idx[i] = TRUE
}
}
df = df[!idx,]
id = unique(df$SEQID)
seqs = character(length(id))
for (i in id) {
## for (word in df$word[df$SEQID == i]) {
## seqs[i] = paste0(seqs[i], " ", word)
## }
seqs[i] = paste(df$word[df$SEQID == i] , collapse = " ")
}
for (i in id) {
seqs[i] = gsub("START ", "", seqs[i])
seqs[i] = gsub(" END", "", seqs[i])
}
seqs = seqs[id]
data.table::fwrite(as.data.frame(seqs), "ticket_sentence.txt", col.names=F)
mm = 0
for (i in id) {
mm = max(mm, length(df$word[df$SEQID == i]))
}
mm
length(unique(df$word))
n = nrow(df)
trans = paste(df$word[1:(n-1)], "->", df$word[2:n])
trans = trans[trans != "END -> START"]
length(unique(trans))
import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
webtext and stopwords are downloaded from the Python shell using nltk.download(). They are saved in ~/nltk/corpus/.
from nltk.corpus import webtext
# use to find bigrams, which are pairs of words
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
words = [w.lower() for w in webtext.words('singles.txt')]
biagram_collocation = BigramCollocationFinder.from_words(words)
biagram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 15)
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
stopset
biagram_collocation.apply_word_filter(filter_stops)
biagram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 5)
import io
def tuple2file(fobj, fname):
out_m = io.open(fname, 'w', encoding='utf-8')
for vv in fobj:
out_m.write('\t'.join([str(x) for x in vv]) + "\n")
out_m.close()
with open('input/item_sentence.txt') as f:
lines = f.readlines()
def convert(lst):
return ' '.join(lst).split()
# Driver code
print(convert(lines))
words = convert(lines)
import nltk
from nltk.collocations import *
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import QuadgramCollocationFinder
from nltk.collocations import BigramAssocMeasures
from nltk.collocations import TrigramAssocMeasures
from nltk.collocations import QuadgramAssocMeasures
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
quadgram_measures = nltk.collocations.QuadgramAssocMeasures()
piacc_stopwords = ["START", "END"]
stopset = set(piacc_stopwords)
filter_stops = lambda w: len(w) < 3 or w in stopset
biagram_collocation = BigramCollocationFinder.from_words(words)
biagram_collocation.apply_word_filter(filter_stops)
# biagram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 10)
scored = biagram_collocation.score_ngrams(bigram_measures.likelihood_ratio)
bi_ss = sorted(scored, key=lambda score: score[1], reverse = True)[0:10]
triagram_collocation = TrigramCollocationFinder.from_words(words)
triagram_collocation.apply_word_filter(filter_stops)
# triagram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 15)
scored = triagram_collocation.score_ngrams(trigram_measures.likelihood_ratio)
tri_ss = sorted(scored, key=lambda score: score[1], reverse = True)[0:10]
quadagram_collocation = QuadgramCollocationFinder.from_words(words)
quadagram_collocation.apply_word_filter(filter_stops)
# quadagram_collocation.nbest(QuadgramAssocMeasures.likelihood_ratio, 10)
scored = quadagram_collocation.score_ngrams(quadgram_measures.likelihood_ratio)
quad_ss = sorted(scored, key=lambda score: score[1], reverse = True)[0:10]
quadagram_collocation.nbest(QuadgramAssocMeasures.likelihood_ratio, 10)
tuple2file(bi_ss, "input/bi_ss.tsv")
tuple2file(tri_ss, "input/tri_ss.tsv")
tuple2file(quad_ss, "input/quad_ss.tsv")
<<lpa_prep>>
item_code_book = c(
"U01a000S",
"U01b000S",
"U03a000S",
"U06a000S",
"U21x000S",
"U19a000S",
"U07x000S",
"U02x000S",
"U11b000S",
"U23x000S")
## ## Party Invitations - 1;
## item_code = "U01a000S"
## ## Party Invitations - 2;
## item_code = "U01b000S"
## ## CD Tally;
## item_code = "U03a000S"
## ## Sprained Ankle - 1;
## item_code = "U06a000S"
## ## Tickets;
## item_code = "U21x000S"
## ## Club Membership - 1;
## item_code = "U19a000S"
## ## Book Order;
## item_code = "U07x000S"
## ## Meeting Room;
## item_code = "U02x000S"
## ## Locate Email;
## item_code = "U11b000S"
## ## Lamp Return;
## item_code = "U23x000S"
for (out_dir in ldir) {
## out_dir = ldir[1]
kk = which(out_dir == ldir)
item_code = item_code_book[kk]
source("R/itemcode.R")
<<piacc_path>>
load(paste0(out_dir, "lpa_membership_mod3.RData"))
item = read_piacc(piacc_path, item_code, sub_str, ignore_str)
mc = max(memb$Class)
for (cc in 1:mc) {
item2sen(item, memb$SEQID[memb$Class == cc])
## system("conda activate tf; python py/collocation.py")
## library(reticulate)
## conda_python("tf")
## source_python("py/collocation.py")
## library(reticulate)
reticulate::use_condaenv("tf")
## reticulate::use_python()
reticulate::source_python("py/collocation.py")
file.rename("input/bi_ss.tsv", paste0("input/mod3_bi_ss_class_", cc,".tsv"))
file.rename("input/tri_ss.tsv", paste0("input/mod3_tri_ss_class_", cc,".tsv"))
file.rename("input/quad_ss.tsv", paste0("input/quad_ss_class_", cc,".tsv"))
}
system(paste0("mv input/*.tsv ", out_dir))
}