From 41c7f7e4358a4054167fc5630925d9d1fa73a20e Mon Sep 17 00:00:00 2001
From: Alex Peters <80283343+alexpeters1208@users.noreply.github.com>
Date: Wed, 3 Jul 2024 12:25:23 -0500
Subject: [PATCH] feat: add dx.data.jobs and dx.data.marketing example data sets (#595)

Add datasets for funnel, funnel_area, and timeline plot types

---------

Co-authored-by: dsmmcken
---
Reviewer note: amended relative to the original commit. In jobs(), ticking
rows now shift EndTime together with StartTime (previously only StartTime
moved, so every ticking job ended before it started). Hunk sizes are
unchanged; the post-image blob hash on the index line is no longer exact.

 .../plot/express/data/data_generators.py      | 153 ++++++++++++++++--
 1 file changed, 141 insertions(+), 12 deletions(-)

diff --git a/plugins/plotly-express/src/deephaven/plot/express/data/data_generators.py b/plugins/plotly-express/src/deephaven/plot/express/data/data_generators.py
index 0acef0370..cf3c6ae66 100644
--- a/plugins/plotly-express/src/deephaven/plot/express/data/data_generators.py
+++ b/plugins/plotly-express/src/deephaven/plot/express/data/data_generators.py
@@ -110,10 +110,10 @@ def get_index(species: str) -> int:
             "timestamp = base_time + (long)((ii + df_len) * SECOND)",
             # pick a random species from the list, using the index as a seed
             "species = (String)species_list[(int)new Random(ii).nextInt(3)]",
-            "sepal_length = get_random_value(`sepal_length`, ii, species)",
-            "sepal_width = get_random_value(`sepal_width`, ii, species)",
-            "petal_length = get_random_value(`petal_length`, ii, species)",
-            "petal_width = get_random_value(`petal_width`, ii, species)",
+            "sepal_length = get_random_value(`sepal_length`, ii + 1, species)",
+            "sepal_width = get_random_value(`sepal_width`, ii + 2, species)",
+            "petal_length = get_random_value(`petal_length`, ii + 3, species)",
+            "petal_width = get_random_value(`petal_width`, ii + 4, species)",
             "species_id = get_index(species)",
         ]
     )
@@ -125,6 +125,135 @@ def get_index(species: str) -> int:
     return source_table
 
 
+def jobs(ticking: bool = True) -> Table:
+    """
+    Returns a synthetic dataset containing five different jobs and their durations over time.
+
+    This dataset is intended to be used with a timeline plot. It demonstrates five different "jobs", each starting
+    two days after the previous, and each lasting 5 days in total. The job's "resource", or the name of the individual
+    assigned to the job, is randomly selected. The dataset continues to loop in this way, moving across time until
+    it is deleted or the server is shut down.
+
+    Notes:
+        Contains the following columns:
+        - Job: a string column denoting the name of the job, ranging from Job1 to Job5
+        - StartTime: a Java Instant column containing the start time of the job
+        - EndTime: a Java Instant column containing the end time of the job
+        - Resource: a string column indicating the name of the person that the job is assigned to
+
+    Args:
+        ticking:
+            If true, the table will tick new data every second.
+
+    Returns:
+        A Deephaven Table
+
+    Examples:
+        ```
+        from deephaven.plot import express as dx
+        jobs = dx.data.jobs()
+        ```
+    """
+
+    def generate_resource(index: int) -> str:
+        random.seed(index)
+        return random.choice(["Mike", "Matti", "Steve", "John", "Jane"])
+
+    jobs_query_strings = [
+        "Job = `Job` + String.valueOf((ii % 5) + 1)",
+        "StartTime = '2020-01-01T00:00:00Z' + ('P1d' * i * 2)",
+        "EndTime = StartTime + 'P5d'",
+        "Resource = generate_resource(ii)",
+    ]
+
+    static_jobs = empty_table(5).update(jobs_query_strings)
+
+    if not ticking:
+        return static_jobs
+
+    # Ticking rows continue 10 days on (5 static jobs x 2 days). Shift EndTime too:
+    # moving only StartTime leaves each job ending 5 days before it starts.
+    shift = ["StartTime = StartTime + 'P10d'", "EndTime = EndTime + 'P10d'"]
+    ticking_jobs = (
+        time_table("PT1s")
+        .drop_columns("Timestamp")
+        .update(jobs_query_strings)
+        .update(shift)
+    )
+
+    return merge([static_jobs, ticking_jobs]).last_by("Job")
+
+
+def marketing(ticking: bool = True) -> Table:
+    """
+    Returns a synthetic ticking dataset tracking the movement of customers from website visit to product purchase.
+
+    This dataset is intended to be used with the `dx.funnel` and `dx.funnel_area` plot types. Each row in this dataset
+    represents an individual that has visited a company website. The individual may download an instance of the product,
+    be considered a potential customer, formally request the price of the product, or purchase the product and receive
+    an invoice. Each of these categories is a strict subset of the last, so it lends itself well to funnel plots.
+
+    Notes:
+        Contains the following columns:
+        - Stage: a string column containing the stage of a customer's interest:
+          VisitedWebsite, Downloaded, PotentialCustomer, RequestedPrice, and InvoiceSent
+        - Count: an integer column counting the number of customers to fall into each category
+
+    Args:
+        ticking:
+            If true, the table will tick new data every second.
+
+    Returns:
+        A Deephaven Table
+
+    Examples:
+        ```
+        from deephaven.plot import express as dx
+        marketing = dx.data.marketing()
+        ```
+    """
+    _ColsToRowsTransform = jpy.get_type(
+        "io.deephaven.engine.table.impl.util.ColumnsToRowsTransform"
+    )
+
+    def weighted_selection(prob: float, index: int) -> bool:
+        random.seed(index)
+        return random.uniform(0, 1) < prob
+
+    marketing_query_strings = [
+        "VisitedWebsite = true",  # appearing in this table assumes a website visit
+        "Downloaded = VisitedWebsite ? weighted_selection(0.45, ii) : false",  # 45% of visits download product
+        "PotentialCustomer = Downloaded ? weighted_selection(0.77, ii + 1) : false",  # 77% of downloads are potential customers
+        "RequestedPrice = PotentialCustomer ? weighted_selection(0.82, ii + 2) : false",  # 82% of flagged potential customers request price
+        "InvoiceSent = RequestedPrice ? weighted_selection(0.24, ii + 3) : false",  # 24% of those who requested price get invoice
+    ]
+
+    marketing_table = empty_table(100).update(marketing_query_strings)
+
+    if ticking:
+        marketing_table = merge(
+            [
+                marketing_table,
+                time_table("PT1s")
+                .update(marketing_query_strings)
+                .drop_columns("Timestamp"),
+            ]
+        )
+
+    return Table(
+        _ColsToRowsTransform.columnsToRows(
+            marketing_table.sum_by().j_table,
+            "Stage",
+            "Count",
+            "VisitedWebsite",
+            "Downloaded",
+            "PotentialCustomer",
+            "RequestedPrice",
+            "InvoiceSent",
+        )
+    )
+
+
 def stocks(ticking: bool = True, hours_of_data: int = 1) -> Table:
     """Returns a Deephaven table containing a generated example data set.
 
@@ -353,23 +482,23 @@ def generate_sex(index: int) -> str:
         return random.choices(sex_list, weights=sex_probs)[0]
 
     def generate_smoker(index: int) -> str:
-        random.seed(index)
+        random.seed(index + 1)
         return random.choices(smoker_list, weights=smoker_probs)[0]
 
     def generate_day(index: int) -> str:
-        random.seed(index)
+        random.seed(index + 2)
         return random.choices(day_list, weights=day_probs)[0]
 
     def generate_time(index: int) -> str:
-        random.seed(index)
+        random.seed(index + 3)
         return random.choices(time_list, weights=time_probs)[0]
 
     def generate_size(index: int) -> int:
-        random.seed(index)
+        random.seed(index + 4)
         return random.choices(size_list, weights=size_probs)[0]
 
     def generate_total_bill(smoker: str, size: int, index: int) -> float:
-        random.seed(index)
+        random.seed(index + 5)
         return round(
             3.68
             + 3.08 * (smoker == "Yes")
@@ -379,7 +508,7 @@ def generate_total_bill(smoker: str, size: int, index: int) -> float:
         )
 
     def generate_tip(total_bill: float, index: int) -> float:
-        random.seed(index)
+        random.seed(index + 6)
         return max(1, round(0.92 + 0.11 * total_bill + random.gauss(0.0, 1.02), 2))
 
     # create synthetic ticking version of the tips dataset that generates one new observation per period
@@ -403,7 +532,7 @@ def generate_tip(total_bill: float, index: int) -> float:
     return merge([tips_table, ticking_table])
 
 
-def election(ticking: bool = True):
+def election(ticking: bool = True) -> Table:
     """
     Returns a ticking version of the Election dataset included in the plotly-express package.
 
@@ -488,7 +617,7 @@ def get_long_val(column: str, index: int) -> int:
     return merge([election_table.head(STATIC_ROWS - 1), ticking_table])
 
 
-def wind(ticking: bool = True):
+def wind(ticking: bool = True) -> Table:
     """
     Returns a ticking version of the Wind dataset included in the plotly-express package.
 