Skip to content

Commit

Permalink
feat: add dx.data.jobs and dx.data.marketing example data sets (deeph…
Browse files Browse the repository at this point in the history
…aven#595)

Add datasets for funnel, funnel_area, and timeline plot types

---------

Co-authored-by: dsmmcken <dsmmcken@gmail.com>
  • Loading branch information
alexpeters1208 and dsmmcken authored Jul 3, 2024
1 parent ad8aad9 commit 41c7f7e
Showing 1 changed file with 141 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,10 @@ def get_index(species: str) -> int:
"timestamp = base_time + (long)((ii + df_len) * SECOND)",
# pick a random species from the list, using the index as a seed
"species = (String)species_list[(int)new Random(ii).nextInt(3)]",
"sepal_length = get_random_value(`sepal_length`, ii, species)",
"sepal_width = get_random_value(`sepal_width`, ii, species)",
"petal_length = get_random_value(`petal_length`, ii, species)",
"petal_width = get_random_value(`petal_width`, ii, species)",
"sepal_length = get_random_value(`sepal_length`, ii + 1, species)",
"sepal_width = get_random_value(`sepal_width`, ii + 2, species)",
"petal_length = get_random_value(`petal_length`, ii + 3, species)",
"petal_width = get_random_value(`petal_width`, ii + 4, species)",
"species_id = get_index(species)",
]
)
Expand All @@ -125,6 +125,135 @@ def get_index(species: str) -> int:
return source_table


def jobs(ticking: bool = True) -> Table:
    """
    Returns a synthetic dataset containing five different jobs and their durations over time.

    This dataset is intended to be used with a timeline plot. It demonstrates five different "jobs", each starting
    two days after the previous, and each lasting 5 days in total. The job's "resource", or the name of the individual
    assigned to the job, is pseudo-randomly selected (seeded by the row index, so it is reproducible). When ticking,
    the dataset continues to loop in this way, moving across time until it is deleted or the server is shut down.

    Notes:
        Contains the following columns:
        - Job: a string column denoting the name of the job, ranging from Job1 to Job5
        - StartTime: a Java Instant column containing the start time of the job
        - EndTime: a Java Instant column containing the end time of the job
        - Resource: a string column indicating the name of the person that the job is assigned to

    Args:
        ticking:
            If true, the table will tick new data every second. If false, a static
            five-row table (one row per job) is returned.

    Returns:
        A Deephaven Table

    Examples:
        ```
        from deephaven.plot import express as dx
        jobs = dx.data.jobs()
        ```
    """

    def generate_resource(index: int) -> str:
        # Seeding with the row index makes the chosen name a pure function of the row.
        random.seed(index)
        return random.choice(["Mike", "Matti", "Steve", "John", "Jane"])

    # Evaluated in order: EndTime is derived from the StartTime computed just above it.
    jobs_query_strings = [
        "Job = `Job` + String.valueOf((ii % 5) + 1)",
        "StartTime = '2020-01-01T00:00:00Z' + ('P1d' * i * 2)",
        "EndTime = StartTime + 'P5d'",
        "Resource = generate_resource(ii)",
    ]

    # One initial row per job (Job1..Job5).
    static_jobs = empty_table(5).update(jobs_query_strings)

    if not ticking:
        return static_jobs

    ticking_jobs = merge(
        [
            static_jobs,
            time_table("PT1s")
            .drop_columns("Timestamp")
            .update(jobs_query_strings)
            # The ticking table's row index restarts at 0, so offset it by the
            # ten days already covered by the five static rows. Both endpoints
            # must move together; shifting only StartTime would leave every
            # ticking job ending five days before it starts.
            .update(
                [
                    "StartTime = StartTime + 'P10d'",
                    "EndTime = EndTime + 'P10d'",
                ]
            ),
        ]
    ).last_by("Job")  # collapse to the most recent row for each job name

    return ticking_jobs


def marketing(ticking: bool = True) -> Table:
    """
    Returns a synthetic dataset that counts customers at each stage between a website visit and a purchase.

    The result is shaped for the `dx.funnel` and `dx.funnel_area` plot types. Internally, every generated row
    stands for one individual who visited a company website. That individual may go on to download the product,
    be flagged as a potential customer, formally request the price, and finally be sent an invoice. A stage can
    only be reached if the previous one was, so each stage's population is a subset of the one before it.
    The per-individual flags are summed and pivoted so the returned table has one row per stage.

    Notes:
        Contains the following columns:
        - Stage: a string column containing the stage of a customer's interest:
            VisitedWebsite, Downloaded, PotentialCustomer, RequestedPrice, and InvoiceSent
        - Count: an integer column counting the number of customers that fall into each category

    Args:
        ticking:
            If true, a new simulated visitor is added every second, so the counts keep updating.
            If false, only the initial 100 simulated visitors are counted.

    Returns:
        A Deephaven Table

    Examples:
        ```
        from deephaven.plot import express as dx
        marketing = dx.data.marketing()
        ```
    """
    columns_to_rows_type = jpy.get_type(
        "io.deephaven.engine.table.impl.util.ColumnsToRowsTransform"
    )

    # NOTE: the query strings below call this helper by name, so it must keep this name.
    def weighted_selection(prob: float, index: int) -> bool:
        # Seeded per call so the outcome depends only on (prob, index).
        random.seed(index)
        draw = random.uniform(0, 1)
        return draw < prob

    # Each stage is only attempted when the previous stage succeeded; the index
    # offsets give every stage its own seed for the same visitor.
    funnel_formulas = [
        # being present in the table at all means the person visited the website
        "VisitedWebsite = true",
        # 45% of visitors download the product
        "Downloaded = VisitedWebsite ? weighted_selection(0.45, ii) : false",
        # 77% of downloaders are flagged as potential customers
        "PotentialCustomer = Downloaded ? weighted_selection(0.77, ii + 1) : false",
        # 82% of potential customers request the price
        "RequestedPrice = PotentialCustomer ? weighted_selection(0.82, ii + 2) : false",
        # 24% of price requests end with an invoice
        "InvoiceSent = RequestedPrice ? weighted_selection(0.24, ii + 3) : false",
    ]

    # Columns that get pivoted into (Stage, Count) rows, in funnel order.
    stage_columns = (
        "VisitedWebsite",
        "Downloaded",
        "PotentialCustomer",
        "RequestedPrice",
        "InvoiceSent",
    )

    # Start from a fixed population of 100 simulated visitors.
    visitors = empty_table(100).update(funnel_formulas)

    if ticking:
        # One additional visitor per second; the timestamp itself is not needed.
        live_visitors = (
            time_table("PT1s").update(funnel_formulas).drop_columns("Timestamp")
        )
        visitors = merge([visitors, live_visitors])

    # Total each stage flag, then turn the single wide row into one row per stage.
    stage_totals = visitors.sum_by()
    return Table(
        columns_to_rows_type.columnsToRows(
            stage_totals.j_table,
            "Stage",
            "Count",
            *stage_columns,
        )
    )


def stocks(ticking: bool = True, hours_of_data: int = 1) -> Table:
"""Returns a Deephaven table containing a generated example data set.
Expand Down Expand Up @@ -353,23 +482,23 @@ def generate_sex(index: int) -> str:
return random.choices(sex_list, weights=sex_probs)[0]

def generate_smoker(index: int) -> str:
random.seed(index)
random.seed(index + 1)
return random.choices(smoker_list, weights=smoker_probs)[0]

def generate_day(index: int) -> str:
random.seed(index)
random.seed(index + 2)
return random.choices(day_list, weights=day_probs)[0]

def generate_time(index: int) -> str:
random.seed(index)
random.seed(index + 3)
return random.choices(time_list, weights=time_probs)[0]

def generate_size(index: int) -> int:
random.seed(index)
random.seed(index + 4)
return random.choices(size_list, weights=size_probs)[0]

def generate_total_bill(smoker: str, size: int, index: int) -> float:
random.seed(index)
random.seed(index + 5)
return round(
3.68
+ 3.08 * (smoker == "Yes")
Expand All @@ -379,7 +508,7 @@ def generate_total_bill(smoker: str, size: int, index: int) -> float:
)

def generate_tip(total_bill: float, index: int) -> float:
random.seed(index)
random.seed(index + 6)
return max(1, round(0.92 + 0.11 * total_bill + random.gauss(0.0, 1.02), 2))

# create synthetic ticking version of the tips dataset that generates one new observation per period
Expand All @@ -403,7 +532,7 @@ def generate_tip(total_bill: float, index: int) -> float:
return merge([tips_table, ticking_table])


def election(ticking: bool = True):
def election(ticking: bool = True) -> Table:
"""
Returns a ticking version of the Election dataset included in the plotly-express package.
Expand Down Expand Up @@ -488,7 +617,7 @@ def get_long_val(column: str, index: int) -> int:
return merge([election_table.head(STATIC_ROWS - 1), ticking_table])


def wind(ticking: bool = True):
def wind(ticking: bool = True) -> Table:
"""
Returns a ticking version of the Wind dataset included in the plotly-express package.
Expand Down

0 comments on commit 41c7f7e

Please sign in to comment.