Skip to content

Commit

Permalink
Added code to emit stats in JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
josephg committed Jan 6, 2024
1 parent 80275aa commit 83f4d1e
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 44 deletions.
24 changes: 19 additions & 5 deletions examples/posstats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,16 @@ pub fn apply_edits_direct(doc: &mut ListCRDT, txns: &Vec<TestTxn>) {
}
}

// cargo run --example posstats --release --features gen_test_data
fn write_stats(name: &str, oplog: &ListOpLog) {
if cfg!(feature = "gen_test_data") {
let stats = oplog.get_stats();
let data = serde_json::to_string_pretty(&stats).unwrap();
let stats_file = format!("stats_{}.json", name);
std::fs::write(&stats_file, data).unwrap();
println!("Wrote stats to {stats_file}");
}
}

#[allow(unused)]
fn print_stats_for_testdata(name: &str) {
Expand Down Expand Up @@ -82,11 +92,13 @@ fn print_stats_for_testdata(name: &str) {
println!("Regular file size {} bytes", data.len());
std::fs::write(out_file.clone(), data.as_slice()).unwrap();
println!("Saved to {}", out_file);

write_stats(name, &doc.oplog);
}

#[allow(unused)]
fn print_stats_for_file(name: &str) {
let contents = std::fs::read(name).unwrap();
let contents = std::fs::read(&format!("benchmark_data/{name}.dt")).unwrap();
println!("\n\nLoaded testing data from {} ({} bytes)", name, contents.len());

#[cfg(feature = "memusage")]
Expand Down Expand Up @@ -133,6 +145,8 @@ fn print_stats_for_file(name: &str) {
get_thread_num_allocations() - start_count);

println!("Resulting document size {} characters", state.len_chars());

write_stats(name, &oplog);
}

// This is a dirty addition for profiling.
Expand Down Expand Up @@ -160,9 +174,9 @@ fn main() {
// print_stats_for_testdata("sveltecomponent");
print_stats_for_testdata("seph-blog1");

print_stats_for_file("benchmark_data/node_nodecc.dt");
print_stats_for_file("benchmark_data/git-makefile.dt");
print_stats_for_file("node_nodecc");
print_stats_for_file("git-makefile");

print_stats_for_file("benchmark_data/friendsforever.dt");
print_stats_for_file("benchmark_data/clownschool.dt");
print_stats_for_file("friendsforever");
print_stats_for_file("clownschool");
}
122 changes: 84 additions & 38 deletions src/list/oplog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ use crate::rev_range::RangeRev;
use crate::rle::KVPair;
use crate::unicount::{chars_to_bytes, count_chars};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use crate::rle::rle_vec::RleStats;

impl Default for ListOpLog {
fn default() -> Self {
Self::new()
Expand Down Expand Up @@ -395,6 +399,65 @@ impl ListOpLog {
.map(|item| self.cg.agent_assignment.agent_span_to_remote(item.1))
}

/// Check if the specified version contains the specified point in time.
// Exported for the fuzzer. Not sure if I actually want this exposed.
pub fn version_contains_lv(&self, local_version: &[LV], target: LV) -> bool {
if local_version.is_empty() { true }
else { self.cg.graph.frontier_contains_version(local_version, target) }
}

// /// Returns all the changes since some (static) point in time.
// pub fn linear_changes_since(&self, start: Time) -> TimeSpan {
// TimeSpan::new(start, self.len())
// }

/// Take the union of two versions.
///
/// One way to think of a version is the name of some subset of operations in the operation log.
/// But a local time array only explicitly names versions at the "tip" of the time DAG. For
/// example, if we have 3 operations: A, B, C with ROOT <- A <- B <- C, then the local version
/// will only name `{C}`, since A and B are implicit.
///
/// version_union takes two versions and figures out the set union for all the contained
/// changes, and returns the version name for that union. `version_union(a, b)` will often
/// simply return `a` or `b`. This happens when one of the versions is a strict subset of the
/// other.
pub fn version_union(&self, a: &[LV], b: &[LV]) -> Frontier {
self.cg.graph.version_union(a, b)
}

pub fn parents_at_version(&self, lv: LV) -> Frontier {
self.cg.graph.parents_at_version(lv)
}

pub(crate) fn estimate_cost(&self, op_range: DTRange) -> usize {
if op_range.is_empty() { return 0; }
else {
let start_idx = self.operations.find_index(op_range.start).unwrap();
let end_idx = self.operations.find_index(op_range.last()).unwrap();

end_idx - start_idx + 1
}
}
}

#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct ListOpLogStats {
op_stats: RleStats,
graph_stats: RleStats,
aa_stats: RleStats,

num_insert_keystrokes: usize,
num_delete_keystrokes: usize,
total_keystrokes: usize,
concurrency_estimate: f32,
graph_rle_size: usize,
num_agents: usize,
}

impl ListOpLog {

pub fn print_stats(&self, detailed: bool) {
self.operations.print_stats("Operations", detailed);

Expand Down Expand Up @@ -450,48 +513,31 @@ impl ListOpLog {

let concurrency = self.cg.graph.estimate_concurrency(self.cg.version.as_ref());
println!("Concurrency estimate: {concurrency}");


}

/// Check if the specified version contains the specified point in time.
// Exported for the fuzzer. Not sure if I actually want this exposed.
pub fn version_contains_time(&self, local_version: &[LV], target: LV) -> bool {
if local_version.is_empty() { true }
else { self.cg.graph.frontier_contains_version(local_version, target) }
}

// /// Returns all the changes since some (static) point in time.
// pub fn linear_changes_since(&self, start: Time) -> TimeSpan {
// TimeSpan::new(start, self.len())
// }

/// Take the union of two versions.
///
/// One way to think of a version is the name of some subset of operations in the operation log.
/// But a local time array only explicitly names versions at the "tip" of the time DAG. For
/// example, if we have 3 operations: A, B, C with ROOT <- A <- B <- C, then the local version
/// will only name `{C}`, since A and B are implicit.
///
/// version_union takes two versions and figures out the set union for all the contained
/// changes, and returns the version name for that union. `version_union(a, b)` will often
/// simply return `a` or `b`. This happens when one of the versions is a strict subset of the
/// other.
pub fn version_union(&self, a: &[LV], b: &[LV]) -> Frontier {
self.cg.graph.version_union(a, b)
}

pub fn parents_at_version(&self, lv: LV) -> Frontier {
self.cg.graph.parents_at_version(lv)
}
pub fn get_stats(&self) -> ListOpLogStats {
let mut i_k = 0;
let mut d_k = 0;

pub(crate) fn estimate_cost(&self, op_range: DTRange) -> usize {
if op_range.is_empty() { return 0; }
else {
let start_idx = self.operations.find_index(op_range.start).unwrap();
let end_idx = self.operations.find_index(op_range.last()).unwrap();
for op in self.operations.iter_merged() {
match op.1.kind {
ListOpKind::Ins => i_k += op.len(),
ListOpKind::Del => d_k += op.len(),
}
}

end_idx - start_idx + 1
ListOpLogStats {
op_stats: self.operations.get_stats(),
graph_stats: self.cg.graph.entries.get_stats(),
aa_stats: self.cg.agent_assignment.client_with_lv.get_stats(),

num_insert_keystrokes: i_k,
num_delete_keystrokes: d_k,
total_keystrokes: i_k + d_k,
num_agents: self.cg.agent_assignment.client_data.len(),
concurrency_estimate: self.cg.graph.estimate_concurrency(self.cg.version.as_ref()),
graph_rle_size: self.cg.graph.count_all_graph_entries(self.cg.version.as_ref()),
}
}

}
20 changes: 19 additions & 1 deletion src/rle/rle_vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,23 @@ use humansize::{DECIMAL, format_size};
use rle::{AppendRle, HasLength, MergableSpan, MergeableIterator, MergeIter, SplitableSpan, SplitableSpanCtx};
use rle::Searchable;
use crate::dtrange::DTRange;

use crate::rle::{HasRleKey, RleKeyedAndSplitable, RleSpanHelpers};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

// Each entry has a key (which we search by), a span and a value at that key.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct RleVec<V: HasLength + MergableSpan + Sized>(pub Vec<V>);

#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct RleStats {
entry_byte_size: usize,
len: usize,
capacity: usize,
}

impl<V: HasLength + MergableSpan + Sized> RleVec<V> {
pub fn new() -> Self { Self(Vec::new()) }

Expand Down Expand Up @@ -64,6 +74,14 @@ impl<V: HasLength + MergableSpan + Sized> RleVec<V> {

pub fn iter_merged(&self) -> MergeIter<Cloned<std::slice::Iter<V>>> { self.0.iter().cloned().merge_spans() }

pub fn get_stats(&self) -> RleStats {
RleStats {
entry_byte_size: std::mem::size_of::<V>(),
len: self.0.len(),
capacity: self.0.capacity(),
}
}

pub fn print_stats(&self, name: &str, _detailed: bool) {
let size = std::mem::size_of::<V>();
println!("-------- {} RLE --------", name);
Expand Down

0 comments on commit 83f4d1e

Please sign in to comment.