From 7b8ebc81340db12e465fb8b9bbe1e670f05b7772 Mon Sep 17 00:00:00 2001 From: "Md. Khairul Islam" Date: Wed, 6 Sep 2023 23:22:59 -0400 Subject: [PATCH] add multi-timeseries dataset --- data_provider/base.py | 209 ------ data_provider/data_factory.py | 28 +- data_provider/data_loader.py | 176 ++++- data_provider/electricity.py | 114 ---- data_provider/traffic.py | 205 ------ download_data.py | 632 ------------------ exp/exp_classification.py | 18 +- exp/exp_long_term_forecasting.py | 51 +- exp/exp_main.py | 318 --------- models/Transformer.py | 2 +- result_long_term_forecast.txt | 12 + .../metrics.npy | Bin 0 -> 148 bytes .../pred.npy | Bin 0 -> 1248 bytes .../true.npy | Bin 0 -> 1248 bytes run.py | 14 +- scripts/Covid/Transformer.sh | 1 + scripts/ILI_script/Transformer_windows.sh | 2 +- .../0.pdf | Bin 0 -> 10623 bytes 18 files changed, 237 insertions(+), 1545 deletions(-) delete mode 100644 data_provider/base.py delete mode 100644 data_provider/electricity.py delete mode 100644 data_provider/traffic.py delete mode 100644 download_data.py delete mode 100644 exp/exp_main.py create mode 100644 results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/metrics.npy create mode 100644 results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/pred.npy create mode 100644 results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/true.npy create mode 100644 scripts/Covid/Transformer.sh create mode 100644 test_results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/0.pdf diff --git a/data_provider/base.py b/data_provider/base.py deleted file mode 100644 index 151fb78..0000000 --- a/data_provider/base.py +++ /dev/null @@ -1,209 +0,0 @@ -import os -from abc import ABC, abstractmethod, abstractproperty -from typing import Union, List -from enum import Enum -import pandas as pd -from pandas import read_csv, to_datetime -import numpy as np -from utils.download import * -from tqdm import tqdm - -DISABLE_PROGRESS = False - -# Type defintions -class DataTypes(str, Enum): - """Defines numerical types of each column.""" - INTEGER = 'int' - FLOAT = 'float' - CATEGORICAL = 'categorical' - DATE = 'date' - - def __str__(self) -> str: - return super().__str__() - -class InputTypes(str, Enum): - """ - Defines input types of each column. - - TARGET: Prediction target - OBSERVED: Past dynamic inputs - KNOWN: Known future values - STATIC: Static values - ID: Single column used as an entity identifier - TIME: Single column exclusively used as a time index - """ - TARGET = 'target' - OBSERVED = 'observed' - KNOWN = 'known' - STATIC = 'static' - - #TODO: add full support for multiple columns - ID = 'id' - TIME = 'time' - - def __str__(self) -> str: - return super().__str__() - -class BaseDataFormatter(ABC): - """ - Abstract base class for all data formatters. - - User can implement the abstract methods below to - perform dataset-specific - manipulations. 
- """ - - data_root = 'datasets' - """Root directory of all datasets""" - - def __init__(self, data_folder:str = '') -> None: - super().__init__() - - self.data_folder = os.path.join(self.data_root, data_folder) - os.makedirs(self.data_folder,exist_ok=True) - - """Directory for input files, a subdir of the data_root""" - - def fix_column_types( - self, df:pd.DataFrame - ) -> pd.DataFrame: - - print('Feature column, Data type, Current type') - for item in self.column_definition: - key, data_type = item[0], item[1] - print(key, data_type, df[key].dtype.name) - - if data_type == DataTypes.CATEGORICAL: - df[key] = df[key].astype(str) - elif data_type == DataTypes.DATE: - df[key] = df[key].apply(to_datetime) - elif data_type == DataTypes.INTEGER: - df[key] = df[key].astype(int) - elif data_type == DataTypes.FLOAT: - df[key] = df[key].astype(float) - - print(df.dtypes) - return df - - def load(self) -> pd.DataFrame: - print(f'Loading {self.data_path}') - - if not os.path.exists(self.data_path): - print(f'{self.data_path} not found.') - df = self.download() - - df = read_csv(self.data_path) - return self.fix_column_types(df) - - @property - @abstractmethod - def data_path(self): - raise NotImplementedError() - - @abstractmethod - def download(self, force=False) -> None: - """Downloads the target file, preprocesses and dumps in the data folder. - Temporary files generated during the download are removed afterwards. - - Args: - force (bool, optional): Force update current file. Defaults to False. - - Raises: - NotImplementedError - """ - raise NotImplementedError() - - @property - @abstractmethod - def column_definition(self) -> list[tuple[Union[str, int], DataTypes, Union[InputTypes, list[InputTypes]]]]: - """ - Defines feature, input type and data type of each column. - It is a list of tuples of the format (feature_name, data_type, input_type) - or (feature_name, data_type, list of input_types) - """ - # https://www.geeksforgeeks.org/extract-multidict-values-to-a-list-in-python/ - raise NotImplementedError() - - @property - def targets(self): return self.extract_columns(input_type=InputTypes.TARGET) - - #TODO: Add support for multiple group ids (e.g. in the prediction processor) - @property - def group_id(self): - """ - Return the group id column where each id value represents one timeseries. - """ - return self.extract_columns(input_type=InputTypes.ID) - - @property - def time_index(self): return self.extract_columns(input_type=InputTypes.TIME) - - @property - @abstractmethod - def parameters(self): - """Defines the fixed parameters used by the model for training. - - Returns: - A dictionary of fixed parameters, e.g.: - - parameters = { - 'window': 1, # Length of input time sequence (past observations) - 'horizon': 1, # Length of output - 'num_epochs': 10, - 'early_stopping_patience': 5, - 'multiprocessing_workers': 1, - } - """ - raise NotImplementedError() - - @abstractmethod - def split(self, data, train_start, val_start, test_start, test_end): - """Performs the default train, validation and test splits. - - Args: - df: Source data frame to split. - valid_boundary: Starting year for validation data - test_boundary: Starting year for test data - - Returns: - Tuple of transformed (train, valid, test) data. - """ - raise NotImplementedError() - - def extract_columns( - self, data_type:Union[DataTypes, List[DataTypes]] = None, - input_type:Union[InputTypes, List[InputTypes]] = None - )-> List[str]: - """Extracts the names of columns that correspond to a define data_type. 
- - Args: - definition: Column definition to use. - data_type: DataType of columns to extract. - input_type: InputType of columns to extract. - - Returns: - Name or a list of names for columns with data and input type specified. - """ - # print(f'\nExtracting data type {data_type}, input type {input_type}.') - columns = [] - for item in self.column_definition: - if data_type is not None: - if isinstance(data_type, list): - found = [d for d in data_type if d in item] - if len(found) == 0: continue - elif data_type not in item: continue - - if input_type is not None: - if isinstance(input_type, list): - found = [d for d in input_type if d in item] - if len(found) == 0: continue - - elif input_type not in item: continue - - columns.append(item[0]) - - # print(f'Extracted columns {columns}.\n') - # if len(columns)==1: - # return columns[0] - - return columns \ No newline at end of file diff --git a/data_provider/data_factory.py b/data_provider/data_factory.py index 3cdb70b..ec01906 100644 --- a/data_provider/data_factory.py +++ b/data_provider/data_factory.py @@ -1,31 +1,24 @@ -from data_provider.data_loader import Dataset_Custom, Dataset_Pred +from data_provider.data_loader import Dataset_Custom, Dataset_Pred, MultiTimeSeries from torch.utils.data import DataLoader data_dict = { 'custom': Dataset_Custom, + 'covid': MultiTimeSeries } def data_provider(args, flag): - Data = data_dict[args.data] timeenc = 0 if args.embed != 'timeF' else 1 - - if flag == 'test': - shuffle_flag = False - drop_last = False - batch_size = args.batch_size - freq = args.freq - elif flag == 'pred': - shuffle_flag = False - drop_last = False - batch_size = 1 - freq = args.freq + if flag == 'pred': Data = Dataset_Pred + batch_size = 1 else: - shuffle_flag = True - drop_last = True + Data = data_dict[args.data] batch_size = args.batch_size - freq = args.freq + + drop_last = flag == 'train' + shuffle_flag = flag == 'train' + freq = args.freq data_set = Data( root_path=args.root_path, @@ -35,7 +28,8 @@ def data_provider(args, flag): features=args.features, target=args.target, timeenc=timeenc, - freq=freq + freq=freq, + scale=not args.no_scale ) print(flag, len(data_set)) data_loader = DataLoader( diff --git a/data_provider/data_loader.py b/data_provider/data_loader.py index 645519c..2152f05 100644 --- a/data_provider/data_loader.py +++ b/data_provider/data_loader.py @@ -1,9 +1,8 @@ import os -import numpy as np import pandas as pd +import numpy as np import os -import torch -from torch.utils.data import Dataset, DataLoader +from torch.utils.data import Dataset from sklearn.preprocessing import StandardScaler from utils.timefeatures import time_features import warnings @@ -60,12 +59,14 @@ def __read_data__(self): border1 = border1s[self.set_type] border2 = border2s[self.set_type] + # choose input data based on Multivariate or Univariate setting if self.features == 'M' or self.features == 'MS': cols_data = df_raw.columns[1:] df_data = df_raw[cols_data] elif self.features == 'S': df_data = df_raw[[self.target]] + # scale data if self.scale: train_data = df_data[border1s[0]:border2s[0]] self.scaler.fit(train_data.values) @@ -73,6 +74,7 @@ def __read_data__(self): else: data = df_data.values + # add time encoding df_stamp = df_raw[['date']][border1:border2] df_stamp['date'] = pd.to_datetime(df_stamp.date) if self.timeenc == 0: @@ -85,6 +87,7 @@ def __read_data__(self): data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) data_stamp = data_stamp.transpose(1, 0) + # select data split self.data_x = 
data[border1:border2] self.data_y = data[border1:border2] self.data_stamp = data_stamp @@ -106,7 +109,9 @@ def __len__(self): return len(self.data_x) - self.seq_len - self.pred_len + 1 def inverse_transform(self, data): - return self.scaler.inverse_transform(data) + if self.scale: + return self.scaler.inverse_transform(data) + return data class Dataset_Pred(Dataset): @@ -212,4 +217,165 @@ def __len__(self): return len(self.data_x) - self.seq_len + 1 def inverse_transform(self, data): - return self.scaler.inverse_transform(data) + if self.scale: + return self.scaler.inverse_transform(data) + return data + + +class MultiTimeSeries(Dataset): + def __init__( + self, root_path, flag='train', size=None, + features='S', data_path='ETTh1.csv', + target='OT', scale=True, timeenc=0, freq='d', + time_col='Date', id_col='FIPS', max_samples=-1 + ): + # size [seq_len, label_len, pred_len] + # info + if size == None: + self.seq_len = 24 * 4 * 4 + self.label_len = 24 * 4 + self.pred_len = 24 * 4 + else: + self.seq_len = size[0] + self.label_len = size[1] + self.pred_len = size[2] + + # init + assert flag in ['train', 'test', 'val'] + type_map = {'train': 0, 'val': 1, 'test': 2} + self.set_type = type_map[flag] + + self.features = features + self.target = target + self.scale = scale + self.timeenc = timeenc + self.freq = freq + + self.root_path = root_path + self.data_path = data_path + + self.id_col = id_col + self.time_col = time_col + self.time_steps = self.seq_len + self.pred_len + self.max_samples = max_samples + self.scaler = StandardScaler() + self.__read_data__() + + def __read_data__(self): + df_raw = pd.read_csv(os.path.join(self.root_path,self.data_path)) + df_raw[self.time_col] = pd.to_datetime(df_raw[self.time_col]) + + id_col, time_col, target, time_steps = self.id_col, self.time_col, self.target, self.time_steps + df_raw.sort_values(by=time_col, inplace=True) + input_cols = [ + col for col in df_raw.columns \ + if col not in [id_col, time_col, target] + ] + + dates = df_raw[time_col].unique() + num_total = len(dates) + num_test = self.pred_len # int(len(dates) * 0.2) + num_vali = self.pred_len # num_total - num_train - num_test + num_train = num_total - num_test - num_vali# int(len(dates) * 0.7) + + border1s = [0, num_train - self.seq_len, num_total - num_test - self.seq_len] + border2s = [num_train, num_train + num_vali, num_total] + # border1 = border1s[self.set_type] + # border2 = border2s[self.set_type] + + border1 = dates[border1s[self.set_type]] + border2 = dates[border2s[self.set_type]-1] + border1 = df_raw[time_col].values.searchsorted(border1, side='left') + border2 = df_raw[time_col].values.searchsorted(border2, side='right') + + # get input features + if self.features == 'M' or self.features == 'MS': + selected_columns = input_cols+[target] + elif self.features == 'S': + selected_columns = [target] + print('Selected columns ', selected_columns) + self.selected_columns = selected_columns + + df_data = df_raw[border1:border2].copy().reset_index(drop=True) + + if self.scale: + train_end = df_raw[time_col].values.searchsorted( + dates[border2s[0]-1], side='right' + ) + train_data = df_raw[0:train_end] + self.scaler.fit(train_data[selected_columns]) + df_data.loc[:, selected_columns] = self.scaler.transform(df_data[selected_columns]) + + # add time encoding + data_stamp = self._add_time_features(df_data.loc[0, [self.time_col]]) + time_encoded_columns = data_stamp.shape[1] + print('Number of time encoded columns :', time_encoded_columns) + + print('Getting valid sampling locations.') + 
+ valid_sampling_locations = [] + split_data_map = {} + for identifier, df in df_data.groupby(id_col): + num_entries = len(df) + if num_entries >= time_steps: + valid_sampling_locations += [ + (identifier, i) + for i in range(num_entries - time_steps + 1) + ] + split_data_map[identifier] = df + + max_samples = self.max_samples # -1 takes all samples + + if max_samples > 0 and len(valid_sampling_locations) > max_samples: + print('Extracting {} samples...'.format(max_samples)) + ranges = [valid_sampling_locations[i] for i in np.random.choice( + len(valid_sampling_locations), max_samples, replace=False)] + else: + # print('Max samples={} exceeds # available segments={}'.format( + # max_samples, len(valid_sampling_locations))) + ranges = valid_sampling_locations + max_samples = len(valid_sampling_locations) + + self.data = np.zeros((max_samples, self.time_steps, len(selected_columns))) + self.data_stamp = np.zeros((max_samples, self.time_steps, time_encoded_columns)) + for i, tup in enumerate(ranges): + if ((i + 1) % 10000) == 0: + print(i + 1, 'of', max_samples, 'samples done...') + identifier, start_idx = tup + sliced = split_data_map[identifier].iloc[start_idx:start_idx + time_steps] + self.data[i, :, :] = sliced[selected_columns] + self.data_stamp[i, :, :] = self._add_time_features(sliced[[self.time_col]]) + + def __getitem__(self, index): + s_end = self.seq_len + r_begin = s_end - self.label_len + r_end = r_begin + self.label_len + self.pred_len + + seq_x = self.data[index][:s_end] + seq_y = self.data[index][r_begin:r_end] + + seq_x_mark = self.data_stamp[index][:s_end] + seq_y_mark = self.data_stamp[index][r_begin:r_end] + return seq_x, seq_y, seq_x_mark, seq_y_mark + + def _add_time_features(self, df): + df_stamp = pd.DataFrame() + df_stamp['date'] = pd.to_datetime(df[self.time_col]) + if self.timeenc == 0: + df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1) + df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1) + df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1) + df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1) + data_stamp = df_stamp.drop(['date'], 1).values + elif self.timeenc == 1: + data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) + data_stamp = data_stamp.transpose(1, 0) + return data_stamp + + def __len__(self): + return len(self.data) # - self.seq_len - self.pred_len + 1 + + def inverse_transform(self, data): + if self.scale: + return self.scaler.inverse_transform(data) + return data \ No newline at end of file diff --git a/data_provider/electricity.py b/data_provider/electricity.py deleted file mode 100644 index 22c067c..0000000 --- a/data_provider/electricity.py +++ /dev/null @@ -1,114 +0,0 @@ -from data_provider.base import * -from pandas import DataFrame, to_datetime - -class ElectricityFormatter(BaseDataFormatter): - - def __init__(self) -> None: - super().__init__() - self.data_folder = os.path.join(self.data_root, 'electricity') - - @property - def data_path(self): - return os.path.join(self.data_folder, 'hourly_electricity.csv') - - @property - def column_definition(self): - return [ - ('id', DataTypes.INTEGER, InputTypes.ID), - ('hours_from_start', DataTypes.INTEGER, InputTypes.TIME, InputTypes.KNOWN), - ('power_usage', DataTypes.FLOAT, InputTypes.TARGET), - ('hour', DataTypes.INTEGER, InputTypes.KNOWN), - ('day_of_week', DataTypes.INTEGER, InputTypes.KNOWN), - ('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC), - ] - - @property - def parameters(self): - return { 
- "window": 7 * 24, # lag times hours - "horizon": 24 - } - - def split(self, data:DataFrame, val_start=1315, test_start=1339): - # this is done following Google's TFT paper - # note that this is different from time index - index = data['days_from_start'] - lags = 7 - - train = data.loc[index < val_start].reset_index(drop=True) - validation = data.loc[ - (index >= (val_start - lags)) & (index < test_start) - ].reset_index(drop=True) - test = data.loc[index >= (test_start-lags)].reset_index(drop=True) - - return train, validation, test - - def download( - self, force=False, start='2014-01-01', end='2014-09-01' - ) -> None: - """Downloads electricity dataset from UCI repository.""" - - if os.path.exists(self.data_path) and not force: - return - - if force: print('Force updating current data.') - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - - csv_path = os.path.join(self.data_folder, 'LD2011_2014.txt') - zip_path = csv_path + '.zip' - - download_and_unzip(url, zip_path, csv_path, self.data_folder) - - print('Aggregating to hourly data') - - df = pd.read_csv(csv_path, index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Filter to match range used by other academic papers - start = pd.to_datetime(start) - end = pd.to_datetime(end) - df = df[(df.index >= start) & (df.index <=end)] - print(f'Filtering out data outside {start} and {end}') - - # Used to determine the start and end dates of a series - output = df.resample('1h').sum().fillna(0) - - earliest_time = output.index.min() - - df_list = [] - for label in tqdm(output, total=output.shape[1]): - # print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0) - - tmp = pd.DataFrame({'power_usage': srs}) - date = tmp.index - tmp['hours_from_start'] = (date - earliest_time).seconds / 60 / 60 + ( - date - earliest_time).days * 24 - tmp['days_from_start'] = (date - earliest_time).days - tmp['date'] = date - tmp['id'] = label - tmp['hour'] = date.hour - tmp['day'] = date.day - tmp['day_of_week'] = date.dayofweek - tmp['month'] = date.month - - df_list.append(tmp) - - output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) - - # Filter to match range used by other academic papers - # output = output[(output['days_from_start'] >= 1096) - # & (output['days_from_start'] < 1346)].copy() - - output.to_csv(self.data_path, index=False) - cleanup(self.data_folder, self.data_path) - - print('Done.') diff --git a/data_provider/traffic.py b/data_provider/traffic.py deleted file mode 100644 index 153d45a..0000000 --- a/data_provider/traffic.py +++ /dev/null @@ -1,205 +0,0 @@ -from data_provider.base import * -from pandas import DataFrame - -class TrafficFormatter(BaseDataFormatter): - def __init__(self) -> None: - super('traffic').__init__() - - @property - def data_path(self): - return os.path.join(self.data_folder, 'hourly_traffic.csv') - - @property - def column_definition(self) -> dict: - return [ - ('id', DataTypes.INTEGER, InputTypes.ID), - ('hours_from_start', DataTypes.INTEGER, InputTypes.TIME, InputTypes.KNOWN), - ('values', DataTypes.FLOAT, InputTypes.TARGET), - ('time_on_day', DataTypes.INTEGER, InputTypes.KNOWN), - ('day_of_week', DataTypes.INTEGER, InputTypes.KNOWN), - ('categorical_id', 
DataTypes.CATEGORICAL, InputTypes.STATIC), - ] - - @property - def parameters(self) -> dict: - return { - "window": 7 * 24, - "horizon": 24 - } - - def split(self, df, valid_boundary=151, test_boundary=166): - """Splits data frame into training-validation-test data frames. - - This also calibrates scaling object, and transforms data for each split. - - Args: - df: Source data frame to split. - valid_boundary: Starting day for validation data - test_boundary: Starting day for test data - - Returns: - Tuple of transformed (train, valid, test) data. - """ - - print('Formatting train-valid-test splits.') - - index = df['sensor_day'] - train = df.loc[index < valid_boundary] - validation = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)] - test = df.loc[index >= test_boundary - 7] - - return train, validation, test - - def download(self, force=False) -> None: - """Downloads traffic dataset from UCI repository.""" - if os.path.exists(self.data_path) and not force: - return - - if force: print('Force updating current data.') - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - - data_folder = self.data_folder - csv_path = os.path.join(data_folder, 'PEMS_train') - zip_path = os.path.join(data_folder, 'PEMS-SF.zip') - - download_and_unzip(url, zip_path, csv_path, data_folder) - - print('Aggregating to hourly data') - - def process_list(s, variable_type=int, delimiter=None): - """Parses a line in the PEMS format to a list.""" - if delimiter is None: - l = [ - variable_type(i) for i in s.replace('[', '').replace(']', '').split() - ] - else: - l = [ - variable_type(i) - for i in s.replace('[', '').replace(']', '').split(delimiter) - ] - - return l - - def read_single_list(filename): - """Returns single list from a file in the PEMS-custom format.""" - with open(os.path.join(data_folder, filename), 'r') as dat: - l = process_list(dat.readlines()[0]) - return l - - def read_matrix(filename): - """Returns a matrix from a file in the PEMS-custom format.""" - array_list = [] - with open(os.path.join(data_folder, filename), 'r') as dat: - lines = dat.readlines() - for i, line in tqdm(enumerate(lines), disable=DISABLE_PROGRESS): - # if (i + 1) % 50 == 0: - # print('Completed {} of {} rows for {}'.format(i + 1, len(lines),filename)) - - array = [ - process_list(row_split, variable_type=float, delimiter=None) - for row_split in process_list( - line, variable_type=str, delimiter=';') - ] - array_list.append(array) - - return array_list - - shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 - train_dayofweek = read_single_list('PEMS_trainlabels') - train_tensor = read_matrix('PEMS_train') - test_dayofweek = read_single_list('PEMS_testlabels') - test_tensor = read_matrix('PEMS_test') - - # Inverse permutate shuffle order - print('Shuffling') - inverse_mapping = { - new_location: previous_location - for previous_location, new_location in enumerate(shuffle_order) - } - reverse_shuffle_order = np.array([ - inverse_mapping[new_location] - for new_location, _ in enumerate(shuffle_order) - ]) - - # Group and reoder based on permuation matrix - print('Reodering') - day_of_week = np.array(train_dayofweek + test_dayofweek) - combined_tensor = np.array(train_tensor + test_tensor) - - day_of_week = day_of_week[reverse_shuffle_order] - combined_tensor = combined_tensor[reverse_shuffle_order] - - # Put everything back into a dataframe - print('Parsing as dataframe') - labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] - - 
hourly_list = [] - for day, day_matrix in enumerate(combined_tensor): - - # Hourly data - hourly = pd.DataFrame(day_matrix.T, columns=labels) - hourly['hour_on_day'] = [int(i / 6) for i in hourly.index - ] # sampled at 10 min intervals - if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: - raise ValueError('Invalid hour! {}-{}'.format( - hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) - - hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] - hourly['sensor_day'] = day - hourly['time_on_day'] = hourly.index - hourly['day_of_week'] = day_of_week[day] - - hourly_list.append(hourly) - - hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) - - # Flatten such that each entitiy uses one row in dataframe - store_columns = [c for c in hourly_frame.columns if 'traj' in c] - other_columns = [c for c in hourly_frame.columns if 'traj' not in c] - flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + - other_columns + ['id']) - - def format_index_string(x): - """Returns formatted string for key.""" - - if x < 10: - return '00' + str(x) - elif x < 100: - return '0' + str(x) - elif x < 1000: - return str(x) - - raise ValueError('Invalid value of x {}'.format(x)) - - for store in tqdm(store_columns, disable=DISABLE_PROGRESS): - # print('Processing {}'.format(store)) - - sliced = hourly_frame[[store] + other_columns].copy() - sliced.columns = ['values'] + other_columns - sliced['id'] = int(store.replace('traj_', '')) - - # Sort by Sensor-date-time - key = sliced['id'].apply(str) \ - + sliced['sensor_day'].apply(lambda x: '_' + format_index_string(x)) \ - + sliced['time_on_day'].apply(lambda x: '_' + format_index_string(x)) - sliced = sliced.set_index(key).sort_index() - - sliced['values'] = sliced['values'].fillna(method='ffill') - sliced['prev_values'] = sliced['values'].shift(1) - sliced['next_values'] = sliced['values'].shift(-1) - - flat_df = pd.concat([flat_df, sliced.dropna()], ignore_index=True, sort=False) - - # Filter to match range used by other academic papers - index = flat_df['sensor_day'] - flat_df = flat_df[index < 173].copy() - - # Creating columns fo categorical inputs - flat_df['categorical_id'] = flat_df['id'].copy() - flat_df['hours_from_start'] = flat_df['time_on_day'] \ - + flat_df['sensor_day']*24 - - flat_df.to_csv(self.data_path, index=False) - cleanup(self.data_folder, self.data_path) - print('Done.') diff --git a/download_data.py b/download_data.py deleted file mode 100644 index 5519f7d..0000000 --- a/download_data.py +++ /dev/null @@ -1,632 +0,0 @@ -"""Script to download data for a default experiment. - -Only downloads data if the csv files are present, unless the "force_download" -argument is supplied. For new datasets, the download_and_unzip(.) can be reused -to pull csv files from an online repository, but may require subsequent -dataset-specific processing. - -Usage: - python3 script_download_data {EXPT_NAME} {OUTPUT_FOLDER} {FORCE_DOWNLOAD} - -Command line args: - EXPT_NAME: Name of experiment to download data for {e.g. electricity} - OUTPUT_FOLDER: Path to folder in which - FORCE_DOWNLOAD: Whether to force data download from scratch. 
- - - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse - -import gc -import glob -import os -import shutil -import sys - -from expt_settings.configs import ExperimentConfig -import numpy as np -import pandas as pd -import pyunpack -import wget - - -# General functions for data downloading & aggregation. -def download_from_url(url, output_path): - """Downloads a file froma url.""" - - print('Pulling data from {} to {}'.format(url, output_path)) - wget.download(url, output_path) - print('done') - - -def recreate_folder(path): - """Deletes and recreates folder.""" - - shutil.rmtree(path) - os.makedirs(path) - - -def unzip(zip_path, output_file, data_folder): - """Unzips files and checks successful completion.""" - - print('Unzipping file: {}'.format(zip_path)) - pyunpack.Archive(zip_path).extractall(data_folder) - - # Checks if unzip was successful - if not os.path.exists(output_file): - raise ValueError( - 'Error in unzipping process! {} not found.'.format(output_file)) - - -def download_and_unzip(url, zip_path, csv_path, data_folder): - """Downloads and unzips an online csv file. - - Args: - url: Web address - zip_path: Path to download zip file - csv_path: Expected path to csv file - data_folder: Folder in which data is stored. - """ - - download_from_url(url, zip_path) - - unzip(zip_path, csv_path, data_folder) - - print('Done.') - - -# Dataset specific download routines. -def download_volatility(config): - """Downloads volatility data from OMI website.""" - - url = 'https://realized.oxford-man.ox.ac.uk/images/oxfordmanrealizedvolatilityindices.zip' - - data_folder = config.data_folder - csv_path = os.path.join(data_folder, 'oxfordmanrealizedvolatilityindices.csv') - zip_path = os.path.join(data_folder, 'oxfordmanrealizedvolatilityindices.zip') - - download_and_unzip(url, zip_path, csv_path, data_folder) - - print('Unzip complete. Adding extra inputs') - - df = pd.read_csv(csv_path, index_col=0) # no explicit index - - # Adds additional date/day fields - idx = [str(s).split('+')[0] for s in df.index - ] # ignore timezones, we don't need them - dates = pd.to_datetime(idx) - df['date'] = dates - df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days - df['day_of_week'] = dates.dayofweek - df['day_of_month'] = dates.day - df['week_of_year'] = dates.weekofyear - df['month'] = dates.month - df['year'] = dates.year - df['categorical_id'] = df['Symbol'].copy() - - # Processes log volatility - vol = df['rv5_ss'].copy() - vol.loc[vol == 0.] 
= np.nan - df['log_vol'] = np.log(vol) - - # Adds static information - symbol_region_mapping = { - '.AEX': 'EMEA', - '.AORD': 'APAC', - '.BFX': 'EMEA', - '.BSESN': 'APAC', - '.BVLG': 'EMEA', - '.BVSP': 'AMER', - '.DJI': 'AMER', - '.FCHI': 'EMEA', - '.FTMIB': 'EMEA', - '.FTSE': 'EMEA', - '.GDAXI': 'EMEA', - '.GSPTSE': 'AMER', - '.HSI': 'APAC', - '.IBEX': 'EMEA', - '.IXIC': 'AMER', - '.KS11': 'APAC', - '.KSE': 'APAC', - '.MXX': 'AMER', - '.N225': 'APAC ', - '.NSEI': 'APAC', - '.OMXC20': 'EMEA', - '.OMXHPI': 'EMEA', - '.OMXSPI': 'EMEA', - '.OSEAX': 'EMEA', - '.RUT': 'EMEA', - '.SMSI': 'EMEA', - '.SPX': 'AMER', - '.SSEC': 'APAC', - '.SSMI': 'EMEA', - '.STI': 'APAC', - '.STOXX50E': 'EMEA' - } - - df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) - - # Performs final processing - output_df_list = [] - for grp in df.groupby('Symbol'): - sliced = grp[1].copy() - sliced.sort_values('days_from_start', inplace=True) - # Impute log volatility values - sliced['log_vol'].fillna(method='ffill', inplace=True) - sliced.dropna() - output_df_list.append(sliced) - - df = pd.concat(output_df_list, axis=0) - - output_file = config.data_csv_path - print('Completed formatting, saving to {}'.format(output_file)) - df.to_csv(output_file) - - print('Done.') - - -def download_electricity(config): - """Downloads electricity dataset from UCI repository.""" - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip' - - data_folder = config.data_folder - csv_path = os.path.join(data_folder, 'LD2011_2014.txt') - zip_path = csv_path + '.zip' - - download_and_unzip(url, zip_path, csv_path, data_folder) - - print('Aggregating to hourly data') - - df = pd.read_csv(csv_path, index_col=0, sep=';', decimal=',') - df.index = pd.to_datetime(df.index) - df.sort_index(inplace=True) - - # Used to determine the start and end dates of a series - output = df.resample('1h').mean().replace(0., np.nan) - - earliest_time = output.index.min() - - df_list = [] - for label in output: - print('Processing {}'.format(label)) - srs = output[label] - - start_date = min(srs.fillna(method='ffill').dropna().index) - end_date = max(srs.fillna(method='bfill').dropna().index) - - active_range = (srs.index >= start_date) & (srs.index <= end_date) - srs = srs[active_range].fillna(0.) 
- - tmp = pd.DataFrame({'power_usage': srs}) - date = tmp.index - tmp['t'] = (date - earliest_time).seconds / 60 / 60 + ( - date - earliest_time).days * 24 - tmp['days_from_start'] = (date - earliest_time).days - tmp['categorical_id'] = label - tmp['date'] = date - tmp['id'] = label - tmp['hour'] = date.hour - tmp['day'] = date.day - tmp['day_of_week'] = date.dayofweek - tmp['month'] = date.month - - df_list.append(tmp) - - output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True) - - output['categorical_id'] = output['id'].copy() - output['hours_from_start'] = output['t'] - output['categorical_day_of_week'] = output['day_of_week'].copy() - output['categorical_hour'] = output['hour'].copy() - - # Filter to match range used by other academic papers - output = output[(output['days_from_start'] >= 1096) - & (output['days_from_start'] < 1346)].copy() - - output.to_csv(config.data_csv_path) - - print('Done.') - - -def download_traffic(config): - """Downloads traffic dataset from UCI repository.""" - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip' - - data_folder = config.data_folder - csv_path = os.path.join(data_folder, 'PEMS_train') - zip_path = os.path.join(data_folder, 'PEMS-SF.zip') - - download_and_unzip(url, zip_path, csv_path, data_folder) - - print('Aggregating to hourly data') - - def process_list(s, variable_type=int, delimiter=None): - """Parses a line in the PEMS format to a list.""" - if delimiter is None: - l = [ - variable_type(i) for i in s.replace('[', '').replace(']', '').split() - ] - else: - l = [ - variable_type(i) - for i in s.replace('[', '').replace(']', '').split(delimiter) - ] - - return l - - def read_single_list(filename): - """Returns single list from a file in the PEMS-custom format.""" - with open(os.path.join(data_folder, filename), 'r') as dat: - l = process_list(dat.readlines()[0]) - return l - - def read_matrix(filename): - """Returns a matrix from a file in the PEMS-custom format.""" - array_list = [] - with open(os.path.join(data_folder, filename), 'r') as dat: - - lines = dat.readlines() - for i, line in enumerate(lines): - if (i + 1) % 50 == 0: - print('Completed {} of {} rows for {}'.format(i + 1, len(lines), - filename)) - - array = [ - process_list(row_split, variable_type=float, delimiter=None) - for row_split in process_list( - line, variable_type=str, delimiter=';') - ] - array_list.append(array) - - return array_list - - shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0 - train_dayofweek = read_single_list('PEMS_trainlabels') - train_tensor = read_matrix('PEMS_train') - test_dayofweek = read_single_list('PEMS_testlabels') - test_tensor = read_matrix('PEMS_test') - - # Inverse permutate shuffle order - print('Shuffling') - inverse_mapping = { - new_location: previous_location - for previous_location, new_location in enumerate(shuffle_order) - } - reverse_shuffle_order = np.array([ - inverse_mapping[new_location] - for new_location, _ in enumerate(shuffle_order) - ]) - - # Group and reoder based on permuation matrix - print('Reodering') - day_of_week = np.array(train_dayofweek + test_dayofweek) - combined_tensor = np.array(train_tensor + test_tensor) - - day_of_week = day_of_week[reverse_shuffle_order] - combined_tensor = combined_tensor[reverse_shuffle_order] - - # Put everything back into a dataframe - print('Parsing as dataframe') - labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')] - - hourly_list = [] - for day, day_matrix in 
enumerate(combined_tensor): - - # Hourly data - hourly = pd.DataFrame(day_matrix.T, columns=labels) - hourly['hour_on_day'] = [int(i / 6) for i in hourly.index - ] # sampled at 10 min intervals - if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0: - raise ValueError('Invalid hour! {}-{}'.format( - hourly['hour_on_day'].min(), hourly['hour_on_day'].max())) - - hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels] - hourly['sensor_day'] = day - hourly['time_on_day'] = hourly.index - hourly['day_of_week'] = day_of_week[day] - - hourly_list.append(hourly) - - hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False) - - # Flatten such that each entitiy uses one row in dataframe - store_columns = [c for c in hourly_frame.columns if 'traj' in c] - other_columns = [c for c in hourly_frame.columns if 'traj' not in c] - flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + - other_columns + ['id']) - - def format_index_string(x): - """Returns formatted string for key.""" - - if x < 10: - return '00' + str(x) - elif x < 100: - return '0' + str(x) - elif x < 1000: - return str(x) - - raise ValueError('Invalid value of x {}'.format(x)) - - for store in store_columns: - print('Processing {}'.format(store)) - - sliced = hourly_frame[[store] + other_columns].copy() - sliced.columns = ['values'] + other_columns - sliced['id'] = int(store.replace('traj_', '')) - - # Sort by Sensor-date-time - key = sliced['id'].apply(str) \ - + sliced['sensor_day'].apply(lambda x: '_' + format_index_string(x)) \ - + sliced['time_on_day'].apply(lambda x: '_' + format_index_string(x)) - sliced = sliced.set_index(key).sort_index() - - sliced['values'] = sliced['values'].fillna(method='ffill') - sliced['prev_values'] = sliced['values'].shift(1) - sliced['next_values'] = sliced['values'].shift(-1) - - flat_df = pd.concat([flat_df, sliced.dropna()], ignore_index=True, sort=False) - - # Filter to match range used by other academic papers - index = flat_df['sensor_day'] - flat_df = flat_df[index < 173].copy() - - # Creating columns fo categorical inputs - flat_df['categorical_id'] = flat_df['id'].copy() - flat_df['hours_from_start'] = flat_df['time_on_day'] \ - + flat_df['sensor_day']*24. - flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy() - flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy() - - flat_df.to_csv(config.data_csv_path) - print('Done.') - - -def process_favorita(config): - """Processes Favorita dataset. - - Makes use of the raw files should be manually downloaded from Kaggle @ - https://www.kaggle.com/c/favorita-grocery-sales-forecasting/data - - Args: - config: Default experiment config for Favorita - """ - - url = 'https://www.kaggle.com/c/favorita-grocery-sales-forecasting/data' - - data_folder = config.data_folder - - # Save manual download to root folder to avoid deleting when re-processing. 
- zip_file = os.path.join(data_folder, '..', - 'favorita-grocery-sales-forecasting.zip') - - if not os.path.exists(zip_file): - raise ValueError( - 'Favorita zip file not found in {}!'.format(zip_file) + - ' Please manually download data from Kaggle @ {}'.format(url)) - - # Unpack main zip file - outputs_file = os.path.join(data_folder, 'train.csv.7z') - unzip(zip_file, outputs_file, data_folder) - - # Unpack individually zipped files - for file in glob.glob(os.path.join(data_folder, '*.7z')): - - csv_file = file.replace('.7z', '') - - unzip(file, csv_file, data_folder) - - print('Unzipping complete, commencing data processing...') - - # Extract only a subset of data to save/process for efficiency - start_date = pd.datetime(2015, 1, 1) - end_date = pd.datetime(2016, 6, 1) - - print('Regenerating data...') - - # load temporal data - temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0) - - store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0) - oil = pd.read_csv( - os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0] - holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv')) - items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0) - transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv')) - - # Take first 6 months of data - temporal['date'] = pd.to_datetime(temporal['date']) - - # Filter dates to reduce storage space requirements - if start_date is not None: - temporal = temporal[(temporal['date'] >= start_date)] - if end_date is not None: - temporal = temporal[(temporal['date'] < end_date)] - - dates = temporal['date'].unique() - - # Add trajectory identifier - temporal['traj_id'] = temporal['store_nbr'].apply( - str) + '_' + temporal['item_nbr'].apply(str) - temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply( - str) - - # Remove all IDs with negative returns - print('Removing returns data') - min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min() - valid_ids = set(min_returns[min_returns >= 0].index) - selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids) - new_temporal = temporal[selector].copy() - del temporal - gc.collect() - temporal = new_temporal - temporal['open'] = 1 - - # Resampling - print('Resampling to regular grid') - resampled_dfs = [] - for traj_id, raw_sub_df in temporal.groupby('traj_id'): - print('Resampling', traj_id) - sub_df = raw_sub_df.set_index('date', drop=True).copy() - sub_df = sub_df.resample('1d').last() - sub_df['date'] = sub_df.index - sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \ - = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill') - sub_df['open'] = sub_df['open'].fillna( - 0) # flag where sales data is unknown - sub_df['log_sales'] = np.log(sub_df['unit_sales']) - - resampled_dfs.append(sub_df.reset_index(drop=True)) - - new_temporal = pd.concat(resampled_dfs, axis=0) - del temporal - gc.collect() - temporal = new_temporal - - print('Adding oil') - oil.name = 'oil' - oil.index = pd.to_datetime(oil.index) - temporal = temporal.join( - oil.loc[dates].fillna(method='ffill'), on='date', how='left') - temporal['oil'] = temporal['oil'].fillna(-1) - - print('Adding store info') - temporal = temporal.join(store_info, on='store_nbr', how='left') - - print('Adding item info') - temporal = temporal.join(items, on='item_nbr', how='left') - - transactions['date'] = pd.to_datetime(transactions['date']) - temporal = temporal.merge( - transactions, - left_on=['date', 
'store_nbr'], - right_on=['date', 'store_nbr'], - how='left') - temporal['transactions'] = temporal['transactions'].fillna(-1) - - # Additional date info - temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek - temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day - temporal['month'] = pd.to_datetime(temporal['date'].values).month - - # Add holiday info - print('Adding holidays') - holiday_subset = holidays[holidays['transferred'].apply( - lambda x: not x)].copy() - holiday_subset.columns = [ - s if s != 'type' else 'holiday_type' for s in holiday_subset.columns - ] - holiday_subset['date'] = pd.to_datetime(holiday_subset['date']) - local_holidays = holiday_subset[holiday_subset['locale'] == 'Local'] - regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional'] - national_holidays = holiday_subset[holiday_subset['locale'] == 'National'] - - temporal['national_hol'] = temporal.merge( - national_holidays, left_on=['date'], right_on=['date'], - how='left')['description'].fillna('') - temporal['regional_hol'] = temporal.merge( - regional_holidays, - left_on=['state', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - temporal['local_hol'] = temporal.merge( - local_holidays, - left_on=['city', 'date'], - right_on=['locale_name', 'date'], - how='left')['description'].fillna('') - - temporal.sort_values('unique_id', inplace=True) - - print('Saving processed file to {}'.format(config.data_csv_path)) - temporal.to_csv(config.data_csv_path) - - -# Core routine. -def main(expt_name, force_download, output_folder): - """Runs main download routine. - - Args: - expt_name: Name of experiment - force_download: Whether to force data download from scratch - output_folder: Folder path for storing data - """ - - print('#### Running download script ###') - - expt_config = ExperimentConfig(expt_name, output_folder) - - if os.path.exists(expt_config.data_csv_path) and not force_download: - print('Data has been processed for {}. Skipping download...'.format( - expt_name)) - sys.exit(0) - else: - print('Resetting data folder...') - recreate_folder(expt_config.data_folder) - - # Default download functions - download_functions = { - 'volatility': download_volatility, - 'electricity': download_electricity, - 'traffic': download_traffic, - 'favorita': process_favorita - } - - if expt_name not in download_functions: - raise ValueError('Unrecongised experiment! name={}'.format(expt_name)) - - download_function = download_functions[expt_name] - - # Run data download - print('Getting {} data...'.format(expt_name)) - download_function(expt_config) - - print('Download completed.') - - -if __name__ == '__main__': - - def get_args(): - """Returns settings from command line.""" - - experiment_names = ExperimentConfig.default_experiments - - parser = argparse.ArgumentParser(description='Data download configs') - parser.add_argument( - 'expt_name', - metavar='e', - type=str, - nargs='?', - choices=experiment_names, - help='Experiment Name. Default={}'.format(','.join(experiment_names))) - parser.add_argument( - 'output_folder', - metavar='f', - type=str, - nargs='?', - default='.', - help='Path to folder for data download') - parser.add_argument( - 'force_download', - metavar='r', - type=str, - nargs='?', - choices=['yes', 'no'], - default='no', - help='Whether to re-run data download') - - args = parser.parse_known_args()[0] - print(args) - - root_folder = None if args.output_folder == '.' 
else args.output_folder - - return args.expt_name, args.force_download == 'yes', root_folder - - name, force, folder = get_args() - main(expt_name=name, force_download=force, output_folder=folder) diff --git a/exp/exp_classification.py b/exp/exp_classification.py index 3be2173..0a9582c 100644 --- a/exp/exp_classification.py +++ b/exp/exp_classification.py @@ -19,8 +19,8 @@ def __init__(self, args): def _build_model(self): # model input depends on data - train_data, train_loader = self._get_data(flag='TRAIN') - test_data, test_loader = self._get_data(flag='TEST') + train_data, _ = self._get_data(flag='train') + test_data, _ = self._get_data(flag='test') self.args.seq_len = max(train_data.max_seq_len, test_data.max_seq_len) self.args.pred_len = 0 self.args.enc_in = train_data.feature_df.shape[1] @@ -43,7 +43,7 @@ def _select_criterion(self): criterion = nn.CrossEntropyLoss() return criterion - def vali(self, vali_data, vali_loader, criterion): + def vali(self, vali_loader, criterion): total_loss = [] preds = [] trues = [] @@ -76,9 +76,9 @@ def vali(self, vali_data, vali_loader, criterion): return total_loss, accuracy def train(self, setting): - train_data, train_loader = self._get_data(flag='TRAIN') - vali_data, vali_loader = self._get_data(flag='TEST') - test_data, test_loader = self._get_data(flag='TEST') + _, train_loader = self._get_data(flag='train') + _, vali_loader = self._get_data(flag='val') + _, test_loader = self._get_data(flag='test') path = os.path.join(self.args.checkpoints, setting) if not os.path.exists(path): @@ -125,8 +125,8 @@ def train(self, setting): print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time)) train_loss = np.average(train_loss) - vali_loss, val_accuracy = self.vali(vali_data, vali_loader, criterion) - test_loss, test_accuracy = self.vali(test_data, test_loader, criterion) + vali_loss, val_accuracy = self.vali(vali_loader, criterion) + test_loss, test_accuracy = self.vali(test_loader, criterion) print( "Epoch: {0}, Steps: {1} | Train Loss: {2:.3f} Vali Loss: {3:.3f} Vali Acc: {4:.3f} Test Loss: {5:.3f} Test Acc: {6:.3f}" @@ -144,7 +144,7 @@ def train(self, setting): return self.model def test(self, setting, test=0): - test_data, test_loader = self._get_data(flag='TEST') + test_data, test_loader = self._get_data(flag='test') if test: print('loading model') self.model.load_state_dict(torch.load(os.path.join('./checkpoints/' + setting, 'checkpoint.pth'))) diff --git a/exp/exp_long_term_forecasting.py b/exp/exp_long_term_forecasting.py index aa9cde4..1415a76 100644 --- a/exp/exp_long_term_forecasting.py +++ b/exp/exp_long_term_forecasting.py @@ -36,7 +36,7 @@ def _select_criterion(self): criterion = nn.MSELoss() return criterion - def vali(self, vali_data, vali_loader, criterion): + def vali(self, vali_loader, criterion): total_loss = [] self.model.eval() with torch.no_grad(): @@ -71,15 +71,21 @@ def vali(self, vali_data, vali_loader, criterion): loss = criterion(pred, true) - total_loss.append(loss) - total_loss = np.average(total_loss) + total_loss.append(loss.item()) + + if len(total_loss) == 0: + print('Warning: no loss values found.') + total_loss = np.inf + else: + total_loss = np.average(total_loss) + self.model.train() return total_loss def train(self, setting): - train_data, train_loader = self._get_data(flag='train') - vali_data, vali_loader = self._get_data(flag='val') - test_data, test_loader = self._get_data(flag='test') + _, train_loader = self._get_data(flag='train') + _, vali_loader = self._get_data(flag='val') + _, 
test_loader = self._get_data(flag='test') path = os.path.join(self.args.checkpoints, setting) if not os.path.exists(path): @@ -158,8 +164,8 @@ def train(self, setting): print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time)) train_loss = np.average(train_loss) - vali_loss = self.vali(vali_data, vali_loader, criterion) - test_loss = self.vali(test_data, test_loader, criterion) + vali_loss = self.vali(vali_loader, criterion) + test_loss = self.vali(test_loader, criterion) print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format( epoch + 1, train_steps, train_loss, vali_loss, test_loss)) @@ -170,13 +176,14 @@ def train(self, setting): adjust_learning_rate(model_optim, epoch + 1, self.args) + print('Loading the best model') best_model_path = path + '/' + 'checkpoint.pth' self.model.load_state_dict(torch.load(best_model_path)) return self.model def test(self, setting, test=0): - test_data, test_loader = self._get_data(flag='test') + _, test_loader = self._get_data(flag='test') if test: print('loading model') self.model.load_state_dict(torch.load(os.path.join('./checkpoints/' + setting, 'checkpoint.pth'))) @@ -219,26 +226,22 @@ def test(self, setting, test=0): outputs = outputs.detach().cpu().numpy() batch_y = batch_y.detach().cpu().numpy() - pred = outputs - true = batch_y - - preds.append(pred) - trues.append(true) + preds.append(outputs) + trues.append(batch_y) if i % 20 == 0: input = batch_x.detach().cpu().numpy() - gt = np.concatenate((input[0, :, -1], true[0, :, -1]), axis=0) - pd = np.concatenate((input[0, :, -1], pred[0, :, -1]), axis=0) + gt = np.concatenate((input[0, :, -1], batch_y[0, :, -1]), axis=0) + pd = np.concatenate((input[0, :, -1], outputs[0, :, -1]), axis=0) visual(gt, pd, os.path.join(folder_path, str(i) + '.pdf')) # this line handles different size of batch. E.g. last batch can be < batch_size. preds = np.concatenate(preds, axis=0) trues = np.concatenate(trues, axis=0) - print('test shape:', preds.shape, trues.shape) - preds = preds.reshape((-1, preds.shape[-2], preds.shape[-1])) - trues = trues.reshape((-1, trues.shape[-2], trues.shape[-1])) - print('test shape:', preds.shape, trues.shape) + preds = preds.reshape((-1, *preds.shape[-2:])) + trues = trues.reshape((-1, *trues.shape[-2:])) + print('Preds and Trues shape:', preds.shape, trues.shape) # result save folder_path = './results/' + setting + '/' @@ -246,12 +249,10 @@ def test(self, setting, test=0): os.makedirs(folder_path) mae, mse, rmse, mape, mspe = metric(preds, trues) - print('mse:{}, mae:{}'.format(mse, mae)) + result_string = f'mse:{mse:0.5g}, mae:{mae:0.5g}, rmse: {rmse:0.5g}, mape: {mape:0.5g}, mspe :{mspe:0.5g}.' 
+ print(result_string) f = open("result_long_term_forecast.txt", 'a') - f.write(setting + " \n") - f.write('mse:{}, mae:{}'.format(mse, mae)) - f.write('\n') - f.write('\n') + f.write(setting + " \n" + result_string + '\n\n') f.close() np.save(folder_path + 'metrics.npy', np.array([mae, mse, rmse, mape, mspe])) diff --git a/exp/exp_main.py b/exp/exp_main.py deleted file mode 100644 index 316e818..0000000 --- a/exp/exp_main.py +++ /dev/null @@ -1,318 +0,0 @@ -from data_provider.data_factory import data_provider -from exp.exp_basic import Exp_Basic -from models import Transformer -from utils.tools import EarlyStopping, adjust_learning_rate, visual -from utils.metrics import metric - -import numpy as np -import torch -import torch.nn as nn -from torch import optim - -import os -import time - -import warnings -import matplotlib.pyplot as plt -import numpy as np - -warnings.filterwarnings('ignore') - - -class Exp_Main(Exp_Basic): - def __init__(self, args): - super(Exp_Main, self).__init__(args) - - def _build_model(self): - model_dict = { - # 'Autoformer': Autoformer, - 'Transformer': Transformer, - # 'Informer': Informer, - # 'Reformer': Reformer, - } - model = model_dict[self.args.model].Model(self.args).float() - - if self.args.use_multi_gpu and self.args.use_gpu: - model = nn.DataParallel(model, device_ids=self.args.device_ids) - return model - - def _get_data(self, flag): - data_set, data_loader = data_provider(self.args, flag) - return data_set, data_loader - - def _select_optimizer(self): - model_optim = optim.Adam(self.model.parameters(), lr=self.args.learning_rate) - return model_optim - - def _select_criterion(self): - criterion = nn.MSELoss() - return criterion - - def vali(self, vali_data, vali_loader, criterion): - total_loss = [] - self.model.eval() - with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(vali_loader): - batch_x = batch_x.float().to(self.device) - batch_y = batch_y.float() - - batch_x_mark = batch_x_mark.float().to(self.device) - batch_y_mark = batch_y_mark.float().to(self.device) - - # decoder input - dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) - # encoder - decoder - if self.args.use_amp: - with torch.cuda.amp.autocast(): - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - else: - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - f_dim = -1 if self.args.features == 'MS' else 0 - outputs = outputs[:, -self.args.pred_len:, f_dim:] - batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device) - - pred = outputs.detach().cpu() - true = batch_y.detach().cpu() - - loss = criterion(pred, true) - - total_loss.append(loss) - total_loss = np.average(total_loss) - self.model.train() - return total_loss - - def train(self, setting): - train_data, train_loader = self._get_data(flag='train') - vali_data, vali_loader = self._get_data(flag='val') - test_data, test_loader = self._get_data(flag='test') - - path = os.path.join(self.args.checkpoints, setting) - if not os.path.exists(path): - os.makedirs(path) - - time_now = time.time() - - train_steps = len(train_loader) - early_stopping = EarlyStopping(patience=self.args.patience, verbose=True) - - 
model_optim = self._select_optimizer() - criterion = self._select_criterion() - - if self.args.use_amp: - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(self.args.train_epochs): - iter_count = 0 - train_loss = [] - - self.model.train() - epoch_time = time.time() - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader): - iter_count += 1 - model_optim.zero_grad() - batch_x = batch_x.float().to(self.device) - - batch_y = batch_y.float().to(self.device) - batch_x_mark = batch_x_mark.float().to(self.device) - batch_y_mark = batch_y_mark.float().to(self.device) - - # decoder input - dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) - - # encoder - decoder - if self.args.use_amp: - with torch.cuda.amp.autocast(): - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - - f_dim = -1 if self.args.features == 'MS' else 0 - outputs = outputs[:, -self.args.pred_len:, f_dim:] - batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device) - loss = criterion(outputs, batch_y) - train_loss.append(loss.item()) - else: - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - - f_dim = -1 if self.args.features == 'MS' else 0 - outputs = outputs[:, -self.args.pred_len:, f_dim:] - batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device) - loss = criterion(outputs, batch_y) - train_loss.append(loss.item()) - - if (i + 1) % 100 == 0: - print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item())) - speed = (time.time() - time_now) / iter_count - left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i) - print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time)) - iter_count = 0 - time_now = time.time() - - if self.args.use_amp: - scaler.scale(loss).backward() - scaler.step(model_optim) - scaler.update() - else: - loss.backward() - model_optim.step() - - print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time)) - train_loss = np.average(train_loss) - vali_loss = self.vali(vali_data, vali_loader, criterion) - test_loss = self.vali(test_data, test_loader, criterion) - - print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format( - epoch + 1, train_steps, train_loss, vali_loss, test_loss)) - early_stopping(vali_loss, self.model, path) - if early_stopping.early_stop: - print("Early stopping") - break - - adjust_learning_rate(model_optim, epoch + 1, self.args) - - best_model_path = path + '/' + 'checkpoint.pth' - self.model.load_state_dict(torch.load(best_model_path)) - - return - - def test(self, setting, test=0): - test_data, test_loader = self._get_data(flag='test') - if test: - print('loading model') - self.model.load_state_dict(torch.load(os.path.join('./checkpoints/' + setting, 'checkpoint.pth'))) - - preds = [] - trues = [] - folder_path = './test_results/' + setting + '/' - if not os.path.exists(folder_path): - os.makedirs(folder_path) - - self.model.eval() - with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader): - batch_x = batch_x.float().to(self.device) - batch_y = batch_y.float().to(self.device) - - batch_x_mark = 
batch_x_mark.float().to(self.device) - batch_y_mark = batch_y_mark.float().to(self.device) - - # decoder input - dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) - # encoder - decoder - if self.args.use_amp: - with torch.cuda.amp.autocast(): - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - else: - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - - f_dim = -1 if self.args.features == 'MS' else 0 - outputs = outputs[:, -self.args.pred_len:, f_dim:] - batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device) - outputs = outputs.detach().cpu().numpy() - batch_y = batch_y.detach().cpu().numpy() - - pred = outputs # outputs.detach().cpu().numpy() # .squeeze() - true = batch_y # batch_y.detach().cpu().numpy() # .squeeze() - - preds.append(pred) - trues.append(true) - if i % 20 == 0: - input = batch_x.detach().cpu().numpy() - gt = np.concatenate((input[0, :, -1], true[0, :, -1]), axis=0) - pd = np.concatenate((input[0, :, -1], pred[0, :, -1]), axis=0) - visual(gt, pd, os.path.join(folder_path, str(i) + '.pdf')) - - preds = np.concatenate(preds, axis=0) - trues = np.concatenate(trues, axis=0) - print('test shape:', preds.shape, trues.shape) - preds = preds.reshape(-1, preds.shape[-2], preds.shape[-1]) - trues = trues.reshape(-1, trues.shape[-2], trues.shape[-1]) - print('test shape:', preds.shape, trues.shape) - - # result save - folder_path = './results/' + setting + '/' - if not os.path.exists(folder_path): - os.makedirs(folder_path) - - mae, mse, rmse, mape, mspe = metric(preds, trues) - print('mse:{}, mae:{}'.format(mse, mae)) - f = open("result.txt", 'a') - f.write(setting + " \n") - f.write('mse:{}, mae:{}'.format(mse, mae)) - f.write('\n') - f.write('\n') - f.close() - - np.save(folder_path + 'metrics.npy', np.array([mae, mse, rmse, mape, mspe])) - np.save(folder_path + 'pred.npy', preds) - np.save(folder_path + 'true.npy', trues) - - return - - def predict(self, setting, load=False): - pred_data, pred_loader = self._get_data(flag='pred') - - if load: - path = os.path.join(self.args.checkpoints, setting) - best_model_path = path + '/' + 'checkpoint.pth' - self.model.load_state_dict(torch.load(best_model_path)) - - preds = [] - - self.model.eval() - with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(pred_loader): - batch_x = batch_x.float().to(self.device) - batch_y = batch_y.float() - batch_x_mark = batch_x_mark.float().to(self.device) - batch_y_mark = batch_y_mark.float().to(self.device) - - # decoder input - dec_inp = torch.zeros([batch_y.shape[0], self.args.pred_len, batch_y.shape[2]]).float() - dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) - # encoder - decoder - if self.args.use_amp: - with torch.cuda.amp.autocast(): - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - else: - if self.args.output_attention: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] - else: - outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark) - pred = 
outputs.detach().cpu().numpy() # .squeeze() - preds.append(pred) - - preds = np.array(preds) - preds = preds.reshape(-1, preds.shape[-2], preds.shape[-1]) - - # result save - folder_path = './results/' + setting + '/' - if not os.path.exists(folder_path): - os.makedirs(folder_path) - - np.save(folder_path + 'real_prediction.npy', preds) - - return diff --git a/models/Transformer.py b/models/Transformer.py index c10a908..512bcc3 100644 --- a/models/Transformer.py +++ b/models/Transformer.py @@ -91,7 +91,7 @@ def classification(self, x_enc, x_mark_enc): return output def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None): - if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast': + if self.task_name == 'long_term_forecast': dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) return dec_out[:, -self.pred_len:, :] # [B, L, D] if self.task_name == 'classification': diff --git a/result_long_term_forecast.txt b/result_long_term_forecast.txt index 3ad3d59..0af9607 100644 --- a/result_long_term_forecast.txt +++ b/result_long_term_forecast.txt @@ -4,3 +4,15 @@ mse:4.545623302459717, mae:1.408710241317749 long_term_forecast_ili_36_24_DLinear_custom_ftM_sl36_ll18_pl24_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0 mse:4.793543338775635, mae:1.6582401990890503 +long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0 +mse:2.168884038925171, mae:0.8049907088279724 + +long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0 +mse:5346124.0, mae:1320.0423583984375 + +long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0 +mse:2.1817, mae:0.8029, rmse: 1.4771, mape: 1.9375, mspe :53.245. + +long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0 +mse:2.1817, mae:0.8029, rmse: 1.4771, mape: 1.9375, mspe :53.245. 
+
diff --git a/results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/metrics.npy b/results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/metrics.npy
new file mode 100644
index 0000000000000000000000000000000000000000..4efb30e994570fcb62bb69260d351251f1e41f0d
GIT binary patch
literal 148
[binary payload omitted]
literal 0 HcmV?d00001
diff --git a/results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/pred.npy b/results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/pred.npy
new file mode 100644
index 0000000000000000000000000000000000000000..8f257062f4f04ec47a4eac719fad4f189a36b0b0
GIT binary patch
literal 1248
[binary payload omitted]
literal 0 HcmV?d00001 diff --git a/run.py b/run.py index b6f5f2c..2ce4784 100644 --- a/run.py +++ b/run.py @@ -12,7 +12,7 @@ torch.manual_seed(fix_seed) np.random.seed(fix_seed) - parser = argparse.ArgumentParser(description='TimesNet') + parser = argparse.ArgumentParser(description='Run Timeseries') # basic config parser.add_argument('--task_name', type=str, required=True, default='long_term_forecast', @@ -24,14 +24,16 @@ # data loader parser.add_argument('--data', type=str, required=True, default='ETTm1', help='dataset type') - parser.add_argument('--root_path', type=str, default='./data/ETT/', help='root path of the data file') - parser.add_argument('--data_path', type=str, default='ETTh1.csv', help='data file') + parser.add_argument('--root_path', type=str, default='./dataset/illness/', help='root path of the data file') + parser.add_argument('--data_path', type=str, default='national_illness.csv', help='data file') parser.add_argument('--features', type=str, default='M', help='forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate') parser.add_argument('--target', type=str, default='OT', help='target feature in S or MS task') parser.add_argument('--freq', type=str, default='h', help='freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h') parser.add_argument('--checkpoints', type=str, default='./checkpoints/', help='location of model checkpoints') + parser.add_argument('--no-scale', action='store_true', help='do not scale the dataset') + parser.add_argument('--group-id', type=str, default=None, help='Group identifier id for multiple timeseries') # forecasting task parser.add_argument('--seq_len', type=int, default=96, help='input sequence length') @@ -39,12 +41,6 @@ parser.add_argument('--pred_len', type=int, default=96, help='prediction sequence length') parser.add_argument('--seasonal_patterns', type=str, default='Monthly', help='subset for M4') - # inputation task - parser.add_argument('--mask_rate', type=float, default=0.25, help='mask ratio') - - # anomaly detection task - parser.add_argument('--anomaly_ratio', type=float, default=0.25, help='prior anomaly ratio (%)') - # model define parser.add_argument('--top_k', type=int, default=5, help='for TimesBlock') parser.add_argument('--num_kernels', type=int, default=6, help='for Inception') diff --git a/scripts/Covid/Transformer.sh b/scripts/Covid/Transformer.sh new file mode 100644 index 0000000..75b5ebe --- /dev/null +++ b/scripts/Covid/Transformer.sh @@ -0,0 +1 @@ +python run.py --is_training 1 --root_path ./dataset/covid/ --data_path Top_20.csv --target Cases --model_id covid_14_14 --model Transformer --data covid --features MS --seq_len 14 --label_len 7 --pred_len 14 --e_layers 2 --d_layers 1 --factor 3 --enc_in 10 --dec_in 10 --c_out 10 --des 'Exp' --freq d --group-id 'FIPS' --train_epochs 2 --itr 1 --task_name long_term_forecast \ No newline at end of file diff --git a/scripts/ILI_script/Transformer_windows.sh b/scripts/ILI_script/Transformer_windows.sh index 346ff17..bc373d4 100644 --- a/scripts/ILI_script/Transformer_windows.sh +++ b/scripts/ILI_script/Transformer_windows.sh @@ -1 +1 @@ -python run.py --is_training 1 --root_path ./dataset/illness/ --data_path national_illness.csv --model_id ili_36_24 --model $model_name --data custom --features M --seq_len 36 --label_len 18 --pred_len 24 --e_layers 2 --d_layers 1 
--factor 3 --enc_in 7 --dec_in 7 --c_out 7 --des 'Exp' --itr 1 --task_name long_term_forecast
\ No newline at end of file
+python run.py --is_training 1 --root_path ./dataset/illness/ --data_path national_illness.csv --model_id ili_36_24 --model Transformer --data custom --features M --seq_len 36 --label_len 18 --pred_len 24 --e_layers 2 --d_layers 1 --factor 3 --enc_in 7 --dec_in 7 --c_out 7 --des 'Exp' --itr 1 --task_name long_term_forecast
\ No newline at end of file
diff --git a/test_results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/0.pdf b/test_results/long_term_forecast_covid_14_14_Transformer_covid_ftMS_sl14_ll7_pl14_dm512_nh8_el2_dl1_df2048_fc3_ebtimeF_dtTrue_Exp_0/0.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..05a2b24b4d7c03311e0d4d43d8b806272790c47d
GIT binary patch
literal 10623
[binary payload omitted]
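The new --group-id option added to run.py above (passed as --group-id 'FIPS' in scripts/Covid/Transformer.sh) is intended to treat each group value in a multi-timeseries file such as dataset/covid/Top_20.csv as its own series, so that sliding windows never straddle two counties. The sketch below only illustrates that idea with a synthetic frame and made-up helper names; it is not the data_provider/data_loader.py implementation.

import numpy as np
import pandas as pd

def grouped_windows(df, group_col, value_cols, seq_len, pred_len):
    """Build (input, target) windows per group so no window crosses two series."""
    xs, ys = [], []
    for _, g in df.groupby(group_col, sort=False):
        values = g[value_cols].to_numpy(dtype=np.float32)
        for start in range(len(values) - seq_len - pred_len + 1):
            xs.append(values[start:start + seq_len])
            ys.append(values[start + seq_len:start + seq_len + pred_len])
    return np.stack(xs), np.stack(ys)

# Synthetic stand-in for a file like dataset/covid/Top_20.csv; column names are illustrative.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "FIPS": np.repeat(["01001", "01003"], 30),
    "Cases": rng.poisson(50, 60).astype(np.float32),
})
x, y = grouped_windows(df, "FIPS", ["Cases"], seq_len=14, pred_len=14)
print(x.shape, y.shape)  # (6, 14, 1) (6, 14, 1): three windows per county, never mixed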
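For reference, the vali, train and test loops in the deleted exp/exp_main.py all build the Transformer decoder input the same way: zero placeholders for the pred_len steps to be forecast, prefixed with the last label_len known steps of batch_y. A minimal standalone illustration using the Covid settings (label_len 7, pred_len 14, dec_in 10):

import torch

batch_size, label_len, pred_len, n_features = 32, 7, 14, 10  # label_len/pred_len as in the Covid script

# batch_y carries the last label_len observed steps followed by the pred_len future steps.
batch_y = torch.randn(batch_size, label_len + pred_len, n_features)

# Zero placeholders for the horizon, prefixed with the known label_len steps.
dec_inp = torch.zeros_like(batch_y[:, -pred_len:, :]).float()
dec_inp = torch.cat([batch_y[:, :label_len, :], dec_inp], dim=1).float()

print(dec_inp.shape)  # torch.Size([32, 21, 10]); the model then predicts the last pred_len steps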
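The test step saves metrics.npy as np.array([mae, mse, rmse, mape, mspe]) returned by utils.metrics.metric, which this patch does not show. The sketch below uses the conventional definitions of those five quantities and may differ in small details (such as epsilon handling) from the repository's version.

import numpy as np

def metric(pred, true):
    # Conventional definitions; the repository's utils/metrics.py may differ slightly.
    mae = np.mean(np.abs(pred - true))
    mse = np.mean((pred - true) ** 2)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((pred - true) / true))
    mspe = np.mean(np.square((pred - true) / true))
    return mae, mse, rmse, mape, mspe

# Dummy arrays shaped like the saved pred.npy / true.npy: (samples, pred_len, targets).
rng = np.random.default_rng(0)
true = rng.normal(1.0, 0.5, size=(20, 14, 1))
pred = true + rng.normal(0.0, 0.1, size=true.shape)

mae, mse, rmse, mape, mspe = metric(pred, true)
print('mse:{:.4f}, mae:{:.4f}, rmse:{:.4f}, mape:{:.4f}, mspe:{:.4f}'.format(mse, mae, rmse, mape, mspe))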