From b126d656da160d06c11e4fae2d2733ac7bc81b03 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni
Date: Wed, 28 Jun 2023 17:46:58 -0400
Subject: [PATCH] WIP: replace TableMeta with Woodwork ColumnSchema metadata

---
 scripts/walmart.py                            |  20 ---
 scripts/yelp.py                               |  83 ------------
 tests/integration_tests/test_examples.py      |  67 ++--------
 ...est_datasets.py => test_load_functions.py} |  93 ++++++++-----
 trane/__init__.py                             |   2 +-
 trane/core/prediction_problem.py              |  44 +++----
 trane/core/prediction_problem_evaluator.py    |  78 +++++------
 trane/core/prediction_problem_saver.py        |   5 +-
 trane/datasets/__init__.py                    |   2 +-
 trane/datasets/load_functions.py              | 124 +++++++++---
 trane/utils/__init__.py                       |   2 +-
 trane/utils/data_parser.py                    |  32 +----
 trane/utils/helper.py                         |  11 +-
 13 files changed, 195 insertions(+), 368 deletions(-)
 delete mode 100644 scripts/walmart.py
 delete mode 100644 scripts/yelp.py
 rename tests/{test_datasets.py => test_load_functions.py} (68%)

diff --git a/scripts/walmart.py b/scripts/walmart.py
deleted file mode 100644
index 6c7f837a..00000000
--- a/scripts/walmart.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pandas as pd
-import pyarrow as pa
-
-
-def create_walmart_parquets():
-    sales = pd.read_csv("train.csv", dtype_backend="pyarrow")
-    stores = pd.read_csv("stores.csv", dtype_backend="pyarrow")
-    features = pd.read_csv("features.csv", dtype_backend="pyarrow")
-
-    pa_type = pd.ArrowDtype(pa.timestamp("s"))
-
-    sales["Date"] = sales["Date"].astype(pa_type)
-    features["Date"] = features["Date"].astype(pa_type)
-
-    sales.to_parquet("sales.parquet")
-    stores.to_parquet("stores.parquet")
-    features.to_parquet("features.parquet")
-
-
-create_walmart_parquets()
diff --git a/scripts/yelp.py b/scripts/yelp.py
deleted file mode 100644
index aea1a53e..00000000
--- a/scripts/yelp.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import os
-
-import pandas as pd
-
-
-def read_json(
-    dir_path,
-    filename,
-    nrows=None,
-    lines=True,
-    dtype_backend="numpy_nullable",
-    engine="ujson",
-):
-    path = os.path.join(dir_path, filename)
-    return pd.read_json(path, lines=lines, nrows=nrows)
-
-
-def create_yelp_parquets():
-    nrows = 50000
-    dir_path = "."
-
- yelp_review_df = read_json( - dir_path, - "yelp_academic_dataset_review.json", - nrows=nrows, - ) - yelp_business_df = read_json( - dir_path, - "yelp_academic_dataset_business.json", - engine="ujson", - nrows=nrows, - ) - yelp_user_df = read_json(dir_path, "yelp_academic_dataset_user.json") - - # don't care about these columns - yelp_business_df = yelp_business_df.drop( - columns=["attributes", "categories", "hours"], - ) - - # from our largest dataframe, find the unique for the foreign keys - valid_business_ids = yelp_review_df["business_id"].unique() - valid_user_ids = yelp_review_df["user_id"].unique() - - # now we need to make sure that the foreign keys in the review dataframe are valid - yelp_business_df = yelp_business_df[ - yelp_business_df["business_id"].isin(valid_business_ids) - ] - yelp_user_df = yelp_user_df[yelp_user_df["user_id"].isin(valid_user_ids)] - - valid_user_ids = yelp_user_df["user_id"].unique() - yelp_review_df = yelp_review_df[yelp_review_df["user_id"].isin(valid_user_ids)] - - # check our primary keys are unique - assert yelp_review_df["review_id"].is_unique - assert yelp_user_df["user_id"].is_unique - assert yelp_business_df["business_id"].is_unique - - # check our foreign keys are valid - assert len(yelp_review_df["user_id"].unique()) == len(yelp_user_df) - assert len(yelp_review_df["business_id"].unique()) == len(yelp_business_df) - - print("Sampling Results ---") - print("Number of reviews: {}".format(len(yelp_review_df))) - print("Number of businesses: {}".format(len(yelp_business_df))) - print("Number of users: {}".format(len(yelp_user_df))) - - merge_step_1 = yelp_review_df.merge( - yelp_user_df, - on="user_id", - suffixes=("_review", "_user"), - ) - merged_df = merge_step_1.merge( - yelp_business_df, - on="business_id", - suffixes=(None, "_business"), - ) - - merged_df["date"] = pd.to_datetime(merged_df["date"]) - merged_df = merged_df.rename(columns={"stars_x": "stars"}) - - merged_df.to_parquet("yelp.parquet") - - -create_yelp_parquets() diff --git a/tests/integration_tests/test_examples.py b/tests/integration_tests/test_examples.py index 5f264d78..0cff2b94 100644 --- a/tests/integration_tests/test_examples.py +++ b/tests/integration_tests/test_examples.py @@ -3,15 +3,13 @@ import pandas as pd import pytest -from woodwork.column_schema import ColumnSchema -from woodwork.logical_types import ( - Categorical, - Datetime, - Double, - Integer, -) import trane +from trane.datasets.load_functions import ( + load_bike_metadata, + load_covid_metadata, + load_youtube_metadata, +) from .utils import generate_and_verify_prediction_problem @@ -34,21 +32,7 @@ def df_youtube(current_dir): @pytest.fixture def meta_youtube(current_dir): - table_meta = { - "trending_date": ColumnSchema(logical_type=Datetime), - "channel_title": ColumnSchema( - logical_type=Categorical, - semantic_tags={"index"}, - ), - "category_id": ColumnSchema( - logical_type=Categorical, - semantic_tags={"category", "index"}, - ), - "views": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - "likes": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - "dislikes": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - "comment_count": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - } + table_meta = load_youtube_metadata() return table_meta @@ -67,22 +51,7 @@ def df_covid(current_dir): @pytest.fixture def meta_covid(current_dir): - table_meta = { - "Province/State": ColumnSchema( - logical_type=Categorical, - semantic_tags={"category"}, - ), - 
"Country/Region": ColumnSchema( - logical_type=Categorical, - semantic_tags={"category", "index"}, - ), - "Lat": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), - "Long": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), - "Date": ColumnSchema(logical_type=Datetime), - "Confirmed": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - "Deaths": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - "Recovered": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - } + table_meta = load_covid_metadata() return table_meta @@ -99,27 +68,7 @@ def df_chicago(current_dir): @pytest.fixture def meta_chicago(current_dir): - table_meta = { - "date": ColumnSchema(logical_type=Datetime), - "hour": ColumnSchema(logical_type=Categorical, semantic_tags={"category"}), - "usertype": ColumnSchema(logical_type=Categorical, semantic_tags={"category"}), - "gender": ColumnSchema(logical_type=Categorical, semantic_tags={"category"}), - "tripduration": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), - "temperature": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), - "from_station_id": ColumnSchema( - logical_type=Categorical, - semantic_tags={"index"}, - ), - "dpcapacity_start": ColumnSchema( - logical_type=Integer, - semantic_tags={"numeric"}, - ), - "to_station_id": ColumnSchema( - logical_type=Categorical, - semantic_tags={"index"}, - ), - "dpcapacity_end": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), - } + table_meta = load_bike_metadata() return table_meta diff --git a/tests/test_datasets.py b/tests/test_load_functions.py similarity index 68% rename from tests/test_datasets.py rename to tests/test_load_functions.py index 4f97e58b..16e46b17 100644 --- a/tests/test_datasets.py +++ b/tests/test_load_functions.py @@ -1,13 +1,22 @@ +from woodwork.column_schema import ColumnSchema +from woodwork.logical_types import ( + Datetime, +) + from trane.datasets.load_functions import ( load_bike, + load_bike_metadata, load_covid, + load_covid_metadata, load_youtube, + load_youtube_metadata, ) def test_load_covid(): df = load_covid() - for col in [ + metadata = load_covid_metadata() + expected_columns = [ "Province/State", "Country/Region", "Lat", @@ -16,10 +25,55 @@ def test_load_covid(): "Confirmed", "Deaths", "Recovered", - ]: - assert col in df.columns + ] + check_column_schema(expected_columns, df, metadata) assert len(df) >= 17136 assert df["Date"].dtype == "datetime64[ns]" + assert metadata["Date"] == ColumnSchema(logical_type=Datetime) + + +def test_load_bike(): + df = load_bike() + metadata = load_bike_metadata() + expected_columns = [ + "date", + "hour", + "usertype", + "gender", + "tripduration", + "temperature", + "from_station_id", + "dpcapacity_start", + "to_station_id", + "dpcapacity_end", + ] + check_column_schema(expected_columns, df, metadata) + assert df["date"].dtype == "datetime64[ns]" + assert metadata["date"] == ColumnSchema(logical_type=Datetime) + + +def test_load_youtube(): + df = load_youtube() + metadata = load_youtube_metadata() + expected_columns = [ + "trending_date", + "channel_title", + "category_id", + "views", + "likes", + "dislikes", + "comment_count", + ] + check_column_schema(expected_columns, df, metadata) + assert df["trending_date"].dtype == "datetime64[ns]" + assert metadata["trending_date"] == ColumnSchema(logical_type=Datetime) + + +def check_column_schema(columns, df, metadata): + for col in columns: + assert col in df.columns + assert col in metadata.keys() + assert 
isinstance(metadata[col], ColumnSchema)
 
 
 # def test_load_flight():
@@ -60,36 +114,3 @@ def test_load_covid():
 #     assert col in flights_df.columns
 
 #     assert flights_df["DATE"].dtype == "datetime64[ns]"
-
-
-def test_load_bike():
-    df = load_bike()
-    for col in [
-        "date",
-        "hour",
-        "usertype",
-        "gender",
-        "tripduration",
-        "temperature",
-        "from_station_id",
-        "dpcapacity_start",
-        "to_station_id",
-        "dpcapacity_end",
-    ]:
-        assert col in df.columns
-    assert df["date"].dtype == "datetime64[ns]"
-
-
-def test_load_youtube():
-    df = load_youtube()
-    for col in [
-        "trending_date",
-        "channel_title",
-        "category_id",
-        "views",
-        "likes",
-        "dislikes",
-        "comment_count",
-    ]:
-        assert col in df.columns
-    assert df["trending_date"].dtype == "datetime64[ns]"
diff --git a/trane/__init__.py b/trane/__init__.py
index e25b6818..73bf8092 100755
--- a/trane/__init__.py
+++ b/trane/__init__.py
@@ -1,7 +1,7 @@
 from trane.core import *  # noqa
 from trane.datasets import (
     load_covid,
-    load_covid_tablemeta,
+    load_covid_metadata,
    load_bike,
     load_youtube,
     load_youtube_metadata,
diff --git a/trane/core/prediction_problem.py b/trane/core/prediction_problem.py
index 0f6b435f..7704e0bb 100755
--- a/trane/core/prediction_problem.py
+++ b/trane/core/prediction_problem.py
@@ -22,7 +22,6 @@
     LessFilterOp,
     NeqFilterOp,
 )
-from trane.utils.table_meta import TableMeta
 
 __all__ = ["PredictionProblem"]
 
@@ -453,51 +452,50 @@ def _check_type(self, expected_type, actual_data):
             ),
         )
 
-        allowed_types_category = [bool, int, str, float]
         allowed_types_bool = [bool, np.bool_]
         allowed_types_text = [str]
         allowed_types_int = [int, np.int64]
         allowed_types_float = [float, np.float64, np.float32]
-        allowed_types_time = (
+        allowed_types_time = (  # noqa: F841
             allowed_types_bool
             + allowed_types_int
             + allowed_types_text
             + allowed_types_float
         )
-        allowed_types_ordered = (
+        allowed_types_ordered = (  # noqa: F841
             allowed_types_bool
             + allowed_types_int
             + allowed_types_text
             + allowed_types_float
         )
-        allowed_types_id = allowed_types_int + allowed_types_text + allowed_types_float
+        allowed_types_id = allowed_types_int + allowed_types_text + allowed_types_float  # noqa: F841
 
-        if expected_type == TableMeta.TYPE_CATEGORY:
-            assert type(actual_data) in allowed_types_category
+        # if expected_type == TableMeta.TYPE_CATEGORY:
+        #     assert type(actual_data) in allowed_types_category
 
-        elif expected_type == TableMeta.TYPE_BOOL:
-            assert type(actual_data) in allowed_types_bool
+        # elif expected_type == TableMeta.TYPE_BOOL:
+        #     assert type(actual_data) in allowed_types_bool
 
-        elif expected_type == TableMeta.TYPE_ORDERED:
-            assert type(actual_data) in allowed_types_ordered
+        # elif expected_type == TableMeta.TYPE_ORDERED:
+        #     assert type(actual_data) in allowed_types_ordered
 
-        elif expected_type == TableMeta.TYPE_TEXT:
-            assert type(actual_data) in allowed_types_text
+        # elif expected_type == TableMeta.TYPE_TEXT:
+        #     assert type(actual_data) in allowed_types_text
 
-        elif expected_type == TableMeta.TYPE_INTEGER:
-            assert type(actual_data) in allowed_types_int
+        # elif expected_type == TableMeta.TYPE_INTEGER:
+        #     assert type(actual_data) in allowed_types_int
 
-        elif expected_type == TableMeta.TYPE_FLOAT:
-            assert type(actual_data) in allowed_types_float
+        # elif expected_type == TableMeta.TYPE_FLOAT:
+        #     assert type(actual_data) in allowed_types_float
 
-        elif expected_type == TableMeta.TYPE_TIME:
-            assert type(actual_data) in allowed_types_time
+        # elif expected_type == TableMeta.TYPE_TIME:
+        #     assert type(actual_data) in allowed_types_time
 
-        elif expected_type == TableMeta.TYPE_IDENTIFIER:
-            assert type(actual_data) in allowed_types_id
+        # elif expected_type == 
TableMeta.TYPE_IDENTIFIER: + # assert type(actual_data) in allowed_types_id - else: - logging.critical("check_type function received an unexpected type.") + # else: + # logging.critical("check_type function received an unexpected type.") def set_parameters(self, **parameters): for operation in self.operations: diff --git a/trane/core/prediction_problem_evaluator.py b/trane/core/prediction_problem_evaluator.py index 3dbdbf8b..2edbe560 100755 --- a/trane/core/prediction_problem_evaluator.py +++ b/trane/core/prediction_problem_evaluator.py @@ -7,8 +7,6 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from trane.utils.table_meta import TableMeta as TM - __all__ = ["PredictionProblemEvaluator"] @@ -104,37 +102,37 @@ def _categorical_threshold(self, df_col, k=3): counter_tuple = counter_tuple[:3] return [item[0] for item in counter_tuple] - def threshold_recommend(self, problem): - filter_op = problem.operations[0] - if len(filter_op.REQUIRED_PARAMETERS) == 0: - yield copy.deepcopy(problem), "no threshold" - else: - if filter_op.input_type == TM.TYPE_CATEGORY: - for item in self._categorical_threshold( - self.sampled_df[filter_op.column_name], - ): - problem_final = copy.deepcopy(problem) - problem_final.operations[0].set_hyper_parameter( - parameter_name="threshold", - parameter_value=item, - ) - yield problem_final, "threshold: {}".format(item) - elif filter_op.input_type in [TM.TYPE_FLOAT, TM.TYPE_INTEGER]: - for keep_rate in [0.25, 0.5, 0.75]: - threshold = filter_op.find_threshhold_by_remaining( - fraction_of_data_target=keep_rate, - df=self.sampled_df, - col=filter_op.column_name, - ) - problem_final = copy.deepcopy(problem) - problem_final.operations[0].set_hyper_parameter( - parameter_name="threshold", - parameter_value=threshold, - ) - yield problem_final, "threshold: {} (keep {}%)".format( - threshold, - keep_rate * 100, - ) + # def threshold_recommend(self, problem): + # filter_op = problem.operations[0] + # if len(filter_op.REQUIRED_PARAMETERS) == 0: + # yield copy.deepcopy(problem), "no threshold" + # else: + # if filter_op.input_type == TM.TYPE_CATEGORY: + # for item in self._categorical_threshold( + # self.sampled_df[filter_op.column_name], + # ): + # problem_final = copy.deepcopy(problem) + # problem_final.operations[0].set_hyper_parameter( + # parameter_name="threshold", + # parameter_value=item, + # ) + # yield problem_final, "threshold: {}".format(item) + # elif filter_op.input_type in [TM.TYPE_FLOAT, TM.TYPE_INTEGER]: + # for keep_rate in [0.25, 0.5, 0.75]: + # threshold = filter_op.find_threshhold_by_remaining( + # fraction_of_data_target=keep_rate, + # df=self.sampled_df, + # col=filter_op.column_name, + # ) + # problem_final = copy.deepcopy(problem) + # problem_final.operations[0].set_hyper_parameter( + # parameter_name="threshold", + # parameter_value=threshold, + # ) + # yield problem_final, "threshold: {} (keep {}%)".format( + # threshold, + # keep_rate * 100, + # ) def split_dataset(self, problem, problem_type, labels, features): X_train, X_test, Y_train, Y_test = [], [], [], [] @@ -191,12 +189,14 @@ def split_dataset(self, problem, problem_type, labels, features): return X_train, X_test, Y_train, Y_test def evaluate(self, problem, features, labels): - if problem.label_type in [TM.TYPE_INTEGER, TM.TYPE_FLOAT]: - problem_type = "regression" - elif problem.label_type in [TM.TYPE_CATEGORY, TM.TYPE_IDENTIFIER]: - problem_type = "classification" - else: - return {"status": "fail", "description": 
"unknown problem type"} + # totally wrong, just for testing + problem_type = "regression" + # if problem.label_type in [TM.TYPE_INTEGER, TM.TYPE_FLOAT]: + # problem_type = "regression" + # elif problem.label_type in [TM.TYPE_CATEGORY, TM.TYPE_IDENTIFIER]: + # problem_type = "classification" + # else: + # return {"status": "fail", "description": "unknown problem type"} template_res = {"problem_type": problem_type, "template_nl": str(problem)} evaluations = [] diff --git a/trane/core/prediction_problem_saver.py b/trane/core/prediction_problem_saver.py index 8400b5ee..87146c29 100755 --- a/trane/core/prediction_problem_saver.py +++ b/trane/core/prediction_problem_saver.py @@ -1,7 +1,6 @@ import json from trane.core.prediction_problem import PredictionProblem -from trane.utils.table_meta import TableMeta __all__ = ["prediction_problems_to_json_file", "prediction_problems_from_json_file"] @@ -80,13 +79,13 @@ def prediction_problems_from_json_file(filename): prediction_problems = [ PredictionProblem.from_json(json.dumps(prob)) for prob in prediction_problems ] - table_meta = TableMeta.from_json(json.dumps(data["table_meta"])) + # table_meta = TableMeta.from_json(json.dumps(data["table_meta"])) entity_id_column = data["entity_id_column"] label_generating_column = data["label_generating_column"] time_column = data["time_column"] return ( prediction_problems, - table_meta, + # table_meta, entity_id_column, label_generating_column, time_column, diff --git a/trane/datasets/__init__.py b/trane/datasets/__init__.py index b89425b0..9e1b9382 100644 --- a/trane/datasets/__init__.py +++ b/trane/datasets/__init__.py @@ -1,6 +1,6 @@ from trane.datasets.load_functions import ( load_covid, - load_covid_tablemeta, + load_covid_metadata, load_bike, load_bike_metadata, load_youtube, diff --git a/trane/datasets/load_functions.py b/trane/datasets/load_functions.py index af08aca9..826e78bc 100644 --- a/trane/datasets/load_functions.py +++ b/trane/datasets/load_functions.py @@ -1,8 +1,13 @@ import os import pandas as pd - -from trane.utils import TableMeta +from woodwork.column_schema import ColumnSchema +from woodwork.logical_types import ( + Categorical, + Datetime, + Double, + Integer, +) def load_covid(): @@ -53,77 +58,68 @@ def load_yelp(): return df -def load_covid_tablemeta(): - metadata = { - "tables": [ - { - "fields": [ - {"name": "Province/State", "type": "text"}, - {"name": "Country/Region", "type": "text"}, - {"name": "Lat", "type": "number", "subtype": "float"}, - {"name": "Long", "type": "number", "subtype": "float"}, - {"name": "Date", "type": "datetime"}, - {"name": "Confirmed", "type": "number", "subtype": "integer"}, - {"name": "Deaths", "type": "number", "subtype": "integer"}, - {"name": "Recovered", "type": "number", "subtype": "integer"}, - ], - }, - ], +def load_covid_metadata(): + table_meta = { + "Province/State": ColumnSchema( + logical_type=Categorical, + semantic_tags={"category"}, + ), + "Country/Region": ColumnSchema( + logical_type=Categorical, + semantic_tags={"category", "index"}, + ), + "Lat": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), + "Long": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), + "Date": ColumnSchema(logical_type=Datetime), + "Confirmed": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), + "Deaths": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), + "Recovered": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), } - return TableMeta(metadata) + return table_meta def load_youtube_metadata(): - 
metadata = { - "tables": [ - { - "fields": [ - {"name": "trending_date", "type": "time"}, - {"name": "channel_title", "type": "id"}, - { - "name": "category_id", - "type": "categorical", - "subtype": "categorical", - }, - {"name": "views", "type": "categorical", "subtype": "number"}, - {"name": "likes", "type": "categorical", "subtype": "integer"}, - {"name": "dislikes", "type": "integer", "subtype": "number"}, - {"name": "comment_count", "type": "integer", "subtype": "number"}, - ], - }, - ], + table_meta = { + "trending_date": ColumnSchema(logical_type=Datetime), + "channel_title": ColumnSchema( + logical_type=Categorical, + semantic_tags={"index"}, + ), + "category_id": ColumnSchema( + logical_type=Categorical, + semantic_tags={"category", "index"}, + ), + "views": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), + "likes": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), + "dislikes": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), + "comment_count": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), } - return TableMeta(metadata) + return table_meta def load_bike_metadata(): - metadata = { - "tables": [ - { - "fields": [ - {"name": "date", "type": "time"}, - {"name": "hour", "subtype": "categorical", "type": "categorical"}, - { - "name": "usertype", - "subtype": "categorical", - "type": "categorical", - }, - {"name": "gender", "subtype": "categorical", "type": "categorical"}, - {"name": "tripduration", "subtype": "float", "type": "number"}, - {"name": "temperature", "subtype": "float", "type": "number"}, - {"name": "from_station_id", "type": "id"}, - { - "name": "dpcapacity_start", - "subtype": "integer", - "type": "number", - }, - {"name": "to_station_id", "type": "id"}, - {"name": "dpcapacity_end", "subtype": "integer", "type": "number"}, - ], - }, - ], + table_meta = { + "date": ColumnSchema(logical_type=Datetime), + "hour": ColumnSchema(logical_type=Categorical, semantic_tags={"category"}), + "usertype": ColumnSchema(logical_type=Categorical, semantic_tags={"category"}), + "gender": ColumnSchema(logical_type=Categorical, semantic_tags={"category"}), + "tripduration": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), + "temperature": ColumnSchema(logical_type=Double, semantic_tags={"numeric"}), + "from_station_id": ColumnSchema( + logical_type=Categorical, + semantic_tags={"index"}, + ), + "dpcapacity_start": ColumnSchema( + logical_type=Integer, + semantic_tags={"numeric"}, + ), + "to_station_id": ColumnSchema( + logical_type=Categorical, + semantic_tags={"index"}, + ), + "dpcapacity_end": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}), } - return TableMeta(metadata) + return table_meta def generate_local_filepath(key): diff --git a/trane/utils/__init__.py b/trane/utils/__init__.py index 23083344..98f5e92d 100755 --- a/trane/utils/__init__.py +++ b/trane/utils/__init__.py @@ -6,7 +6,7 @@ _solve_evaluation, multi_process_evaluation, multiprocess_prediction_problem, - overall_prediction_helper, + # overall_prediction_helper, execute_prediction_problems, ) from trane.utils.evaluate_tool import * # noqa diff --git a/trane/utils/data_parser.py b/trane/utils/data_parser.py index daae4c02..d166011c 100644 --- a/trane/utils/data_parser.py +++ b/trane/utils/data_parser.py @@ -1,10 +1,6 @@ -from datetime import datetime - import pandas as pd -from trane.utils.table_meta import TableMeta as TM - -__all__ = ["denormalize", "parse_data"] +__all__ = ["denormalize"] class CsvMerge: @@ -87,29 +83,3 @@ def 
denormalize(relationships): assert len(csv_merge_objs) == 1 return csv_merge_objs[0].get_data() - - -def parse_data(dataframe, table_meta): - """ - Convert columns specified as time in the table_meta from str objects to datetime objects. - - Parameters - ---------- - dataframe: the data - table_meta: a TableMeta object specifying meta information about the data - - Returns - ---------- - dataframe: with time columns converted from str to datetime. - """ - - columns = table_meta.get_columns() - for column in columns: - if table_meta.get_type(column) == TM.TYPE_TIME: - dataframe[column] = dataframe[column].apply( - lambda x: datetime.strptime( - x, - table_meta.get_property(column, "format"), - ), - ) - return dataframe diff --git a/trane/utils/helper.py b/trane/utils/helper.py index 1fb404ae..4c14ce58 100755 --- a/trane/utils/helper.py +++ b/trane/utils/helper.py @@ -4,13 +4,10 @@ from tqdm.contrib.concurrent import process_map from tqdm.notebook import tqdm -from trane.utils.table_meta import TableMeta as TM - - -def overall_prediction_helper(df, meta): - df["__fake_root_entity__"] = 0 - meta.add_column("__fake_root_entity__", TM.TYPE_IDENTIFIER) - return df, meta +# def overall_prediction_helper(df, meta): +# df["__fake_root_entity__"] = 0 +# meta.add_column("__fake_root_entity__", TM.TYPE_IDENTIFIER) +# return df, meta def execute_prediction_problems(df, problems):
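

A usage sketch of the metadata API this patch moves to (not part of the patch itself): the load_*_metadata() functions now return a plain dict mapping column names to woodwork ColumnSchema objects rather than a TableMeta. The snippet below mirrors check_column_schema from tests/test_load_functions.py against the covid dataset; validate_metadata is an illustrative name, not a helper this patch adds.

import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Datetime

from trane.datasets.load_functions import load_covid, load_covid_metadata


def validate_metadata(df: pd.DataFrame, metadata: dict) -> None:
    # Every column described by the metadata should exist in the dataframe,
    # and every entry should be a woodwork ColumnSchema.
    for col, schema in metadata.items():
        assert col in df.columns, f"missing column: {col}"
        assert isinstance(schema, ColumnSchema)


df = load_covid()
metadata = load_covid_metadata()
validate_metadata(df, metadata)

# ColumnSchema supports equality checks, which the renamed tests rely on:
assert metadata["Date"] == ColumnSchema(logical_type=Datetime)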
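The evaluator above currently hard-codes problem_type = "regression" while the TableMeta-based dispatch is commented out. One way the same dispatch could be rebuilt on the new metadata, sketched under the assumption that label columns carry the semantic tags used in load_functions.py ("numeric" for regression targets, "category"/"index" for classification); infer_problem_type is hypothetical and not part of this patch.

from woodwork.column_schema import ColumnSchema


def infer_problem_type(label_schema: ColumnSchema) -> str:
    # Map woodwork semantic tags onto the evaluator's two problem types.
    tags = set(label_schema.semantic_tags or ())
    if "numeric" in tags:
        return "regression"
    if tags & {"category", "index"}:
        return "classification"
    raise ValueError(f"unknown problem type for tags: {tags}")

If the metadata dicts above become the source of truth, evaluate() could look up the label column's ColumnSchema there instead of hard-coding "regression".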