Commit

add setup py and LICENSE
GishB committed Mar 20, 2024
1 parent f2d4952 commit 20d9e9f
Showing 24 changed files with 1,573 additions and 13 deletions.
58 changes: 58 additions & 0 deletions build/lib/data/CloudData.py
@@ -0,0 +1,58 @@
import pandas as pd
from io import StringIO
import requests


class DrillingData:
""" This class helps to load las files for one of selected horizontal well.
Notes:
- If you define name which does not exist then program will raise NameError!
- All data are restored on public yandex cloud server.
- If you need to load just raw data then use load_raw_data method.
Attributes:
dataset_name: indicate which well you need to load data.
"""

def __init__(self,
dataset_name: str = "default",
sep: str = ","):
self.url_dict = {
"229G": "https://storage.yandexcloud.net/cloud-files-public/229G_las_files.csv",
"231G": "https://storage.yandexcloud.net/cloud-files-public/231G_las_files.csv",
"237G": "https://storage.yandexcloud.net/cloud-files-public/237G_las_files.csv",
"xxxAA564G": "https://storage.yandexcloud.net/cloud-files-public/dataframe.csv",
"xxxAA684G": "https://storage.yandexcloud.net/cloud-files-public/dataframe.csv"
}
self.dataset_name = dataset_name
self.sep = sep
        if dataset_name not in ["default", "229G", "231G", "237G", "xxxAA684G", "xxxAA564G"]:
            raise NameError("There is no such dataset name.")
if dataset_name in ["xxxAA684G", "xxxAA564G"]:
self.sep = "|"

def load_raw_data(self, url: str) -> pd.DataFrame:
""" Load las files as it is in pandas format.
Warning:
- These files are available only for education purposes and shall not be used for any other points.
Notes:
- value like -9999 means that data has been missing.
- unitless column means type of layer.
- uR/h the most important drilling data columns for analysis.
Returns:
pandas dataframe with all available columns and rows from chosen las file.
"""
return pd.read_csv(StringIO(requests.get(url).content.decode('utf-8')), sep=self.sep)

def get(self) -> pd.DataFrame:
if self.dataset_name == "default":
raw_data = self.load_raw_data(url=self.url_dict.get("237G"))
else:
raw_data = self.load_raw_data(url=self.url_dict.get(self.dataset_name))
if self.dataset_name in ["xxxAA684G", "xxxAA564G"]:
raw_data = raw_data[raw_data[raw_data.columns[0]] == self.dataset_name]
return raw_data.reset_index(drop=True)
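A minimal usage sketch (assuming the public Yandex Cloud URLs above are reachable and the module is importable as data.CloudData, as elsewhere in this repository):

# Load the "237G" well and inspect the first rows.
from data.CloudData import DrillingData

loader = DrillingData(dataset_name="237G")
df = loader.get()
print(df.shape)
print(df.head())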
72 changes: 72 additions & 0 deletions build/lib/data/DataTransformers.py
@@ -0,0 +1,72 @@
import pandas as pd
import numpy as np


class CloudDataTransformer:
""" This class helps to transform raw data as it expected for experiments in the project.
Attributes:
df: dataframe which was loaded from the cloud.
dataset_name: dataset name which was loaded via CloudData
"""

def __init__(self, df: pd.DataFrame = None,
dataset_name: str = "default"):
self.df = df
self.dataset_name = dataset_name

if self.df is None:
raise ValueError("dataset has not been defined!")

if self.dataset_name not in ["default", "229G", "231G", "237G", "xxxAA684G", "xxxAA564G"]:
            raise NameError("There is no such dataset name.")

@staticmethod
    def add_time_column(df: pd.DataFrame) -> pd.DataFrame:
        # Index the rows with synthetic 1-second timestamps.
        df['time'] = np.arange(df.shape[0]).astype('datetime64[s]')
        df = df.set_index('time')
        return df

@staticmethod
def replace_nan_values(df: pd.DataFrame) -> pd.DataFrame:
return df.replace(['-9999', -9999, 'missing', '#'], np.nan)

def drop_nan_values(self, df: pd.DataFrame) -> pd.DataFrame:
df = self.replace_nan_values(df)
df = df[df['unitless'].notna()]
df = df[df['uR/h'].notna()]
return df

@staticmethod
    def add_expected_change_points(df: pd.DataFrame) -> pd.DataFrame:
        # Mark a change point (1) wherever the lithology label differs in the next row.
        cps_list = [1 if df['unitless'].iloc[i] != df['unitless'].iloc[i + 1] else 0 for i in range(df.shape[0] - 1)]
        df['CPs'] = cps_list + [0]
        return df
return df

@staticmethod
def take_expected_columns(df: pd.DataFrame) -> pd.DataFrame:
return df[["uR/h", "ohmm", "ohmm.6", "m/hr", "unitless", "CPs"]].reset_index(drop=True)

@staticmethod
def rename_column_special(df: pd.DataFrame) -> pd.DataFrame:
return df.rename(columns={"uR/h": "GR",
"ohmm": "Resist_short",
"ohmm.6": "Resist_long",
"unitless": "LITHOLOGY",
"m/hr": "DrillingSpeed"})

def transform(self) -> pd.DataFrame:
""" Transform data initial point.
Returns:
DataFrame as it expected to be for any future tasks.
"""
df = self.df
if self.dataset_name in ["xxxAA684G", "xxxAA564G"]:
df = self.replace_nan_values(df)
df = self.add_expected_change_points(df)
df = self.take_expected_columns(df)
df = self.rename_column_special(df)
df = self.add_time_column(df)
return df
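A minimal end-to-end sketch combining the loader and the transformer above (dataset name "237G" chosen for illustration):

from data.CloudData import DrillingData
from data.DataTransformers import CloudDataTransformer

raw = DrillingData(dataset_name="237G").get()
df = CloudDataTransformer(df=raw, dataset_name="237G").transform()
print(df.columns.tolist())  # ['GR', 'Resist_short', 'Resist_long', 'DrillingSpeed', 'LITHOLOGY', 'CPs']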
173 changes: 173 additions & 0 deletions build/lib/data/SythData.py
@@ -0,0 +1,173 @@
import numpy as np
import pandas as pd


class SythDataConstructor:
""" This is fundament class which should be used for any syth data generators.
Notes:
length_data % cpu_numbers has to be equal 0!
Attributes:
frequency: which freq should be used seconds, minutes, days.
length_data: just how many points should be generated.
cps_number: number of change points over generated data.
"""

def __init__(self,
white_noise_level: str = "default",
frequency: str = "s",
length_data: int = 24 * 7 * 15 + 15,
cps_number: int = 15):
self.frequency = frequency
self.length_data = length_data
self.cps_number = cps_number

self.white_mean = 0
if white_noise_level == "default":
self.white_std = 0.5
elif white_noise_level == "max":
self.white_std = 1
elif white_noise_level == "min":
self.white_std = 0.01
else:
raise NameError("Not implemented white noise level!")

        if length_data % cps_number != 0:
            raise ValueError("length_data must be divisible by cps_number for synthetic data generation!")

def generate_empty_df(self) -> pd.DataFrame:
""" Generate dataframe with timestamps.
Returns:
pandas dataframe with expected frequency and length
"""
return pd.DataFrame(index=pd.date_range(start="10/07/1999",
periods=self.length_data,
freq=self.frequency,
normalize=True,
inclusive="both",
name="time"))

def generate_white_noise(self) -> np.array:
""" Generate random noise for your data.
Returns:
array of white noise based on expected length of data.
"""
return np.random.normal(self.white_mean,
self.white_std,
size=self.length_data)

def generate_array_of_change_points(self) -> np.array:
""" Generate values which represent CPs over syth data.
Returns:
numpy array of int values where 1 is change point and 0 is default value.
"""
cps_index = [i for i in range(self.length_data // self.cps_number,
self.length_data,
self.length_data // self.cps_number)]
dp = [0 if i not in cps_index else 1 for i in range(self.length_data)]
return np.array(dp)

    def generate_data(self) -> np.array:
        """ Generate the synthetic data array.
        Returns:
            expected synthetic data based on the subclass's idea.
        """
        raise NotImplementedError("Subclasses must implement generate_data.")

    def get(self) -> pd.DataFrame:
        """ Get the synthetic data.
        Returns:
            pandas DataFrame with synthetic data and a time index.
        """
        raise NotImplementedError("Subclasses must implement get.")


class LinearSteps(SythDataConstructor):
def get_linear_array(self,
beta_past: float,
k_past: float,
beta_mutation_coeff: float,
k_mutation_coeff: float) -> tuple[np.array, float, float]:
""" Generate random linear array based on past observation
Notes:
beta_mutation_coeff as well as k_mutation_coeff should be defined based on expertise. These coefficients
help to connect nearest arrays.
Args:
beta_past: beta value in the past array.
k_past: k coefficient in the past array.
beta_mutation_coeff: treshold for beta deviation.
k_mutation_coeff: treshold for k coeff deviation.
Returns:
tuple of generated data and info for this generations beta and k_coeff.
"""
beta = np.random.uniform(beta_past, 1)
k_coeff = np.random.uniform(k_past, 1)
if np.random.uniform(0, 1) > beta_mutation_coeff:
beta = np.random.uniform(-1, 1)
if np.random.uniform(0, 1) > k_mutation_coeff:
k_coeff = np.random.uniform(-1, 1)
dp = [k_coeff * x + beta for x in range(0, self.length_data // self.cps_number)]
return np.array(dp), beta, k_coeff

def generate_data(self, initial_beta: float = -0.01,
initial_k: float = 0.2,
beta_mutation_coeff: float = 0.8,
k_mutation_coeff: float = 0.2) -> np.array:
dp = []
for steps in range(self.cps_number):
temp_info = self.get_linear_array(initial_beta,
initial_k,
beta_mutation_coeff,
k_mutation_coeff)
dp.extend(temp_info[0])
initial_beta = temp_info[1]
initial_k = temp_info[2]
return np.array(dp)

def get(self):
df = self.generate_empty_df()
df['x'] = np.add(self.generate_data(), self.generate_white_noise())
df['CPs'] = self.generate_array_of_change_points()
return df


class SinusoidWaves(SythDataConstructor):
def get_sinusoid_array(self, beta_past: float, beta_mutation_coeff: float) -> tuple[np.array, float]:
""" Generate sinusoid waves over expected shape.
Args:
beta_past: beta coefficient for sinus wave.
beta_mutation_coeff: coeff for mutation operator.
Returns:
array of sinusoid data
"""
beta_past = np.random.uniform(low=beta_past, high=2)
if np.random.uniform(low=0, high=1) > beta_mutation_coeff:
beta_past = np.random.uniform(low=-2, high=2)
        x = np.linspace(start=0, stop=self.length_data // self.cps_number, num=self.length_data // self.cps_number)
return np.sin(x) * beta_past, beta_past

def generate_data(self, initial_beta: float = 0.5, beta_mutation_coeff: float = 0.5) -> np.array:
dp = []
for steps in range(self.cps_number):
temp_info = self.get_sinusoid_array(initial_beta,
beta_mutation_coeff)
dp.extend(temp_info[0])
initial_beta = temp_info[1]
return np.array(dp)

def get(self):
df = self.generate_empty_df()
df['x'] = np.add(self.generate_data(), self.generate_white_noise())
df['CPs'] = self.generate_array_of_change_points()
return df
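A minimal sketch of generating synthetic series with the two generators above (the parameter values here are illustrative; length_data must be divisible by cps_number):

from data.SythData import LinearSteps, SinusoidWaves

steps = LinearSteps(white_noise_level="min", length_data=1500, cps_number=15).get()
waves = SinusoidWaves(white_noise_level="default", length_data=1500, cps_number=15).get()
print(steps.head())             # columns: x (signal) and CPs (change-point labels)
print(int(waves['CPs'].sum()))  # 14 internal change points for 15 segments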
Empty file added build/lib/data/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions build/lib/examples/DefaultStreamlitApp.py
@@ -0,0 +1,38 @@
import streamlit as st
import sys

sys.path.append("..")

import utils.StreamlitFunctions as useful

st.title('Change Point Detection examples.')

st.sidebar.header('UI Model pipeline')
option_model = st.sidebar.selectbox(
'Select CPD model',
("Singular Sequence Decomposition", "Kalman Filter"))
model_params = useful.init_model_params(model_name=option_model)

option_data = st.sidebar.selectbox(
'Select dataset',
("None", "Syth-Steps", "Syth-Sinusoid"))

df = None
if option_data != "None":
data_loader_params = useful.init_data_loader_params()
df = useful.data_loader(option=option_data, params=data_loader_params)
useful.data_info(df)
useful.data_plot(df)

option_start_model = st.sidebar.selectbox(
'Run selected model',
("None", "RUN!"))

df_updated = None
if option_start_model != "None" and df is not None:
df_updated = useful.init_model_pipeline(name_model=option_model, params=model_params, df=df)

if df_updated is not None:
summary_df = useful.model_summary(df=df_updated)
useful.data_info(summary_df)
useful.plot_results(df_updated)
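The app is launched like any Streamlit script, e.g. streamlit run DefaultStreamlitApp.py, run from the examples directory so that the sys.path.append("..") above resolves the utils import.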
Empty file added build/lib/examples/__init__.py
Empty file.
Empty file.
Empty file.
45 changes: 45 additions & 0 deletions build/lib/experiments/OnlineCPD_experiments/kalman_runner.py
@@ -0,0 +1,45 @@
import sys
import os
sys.path.append(os.path.abspath("../.."))

from utils import libs_cpd, Reports as crtest
import data.CloudData as dtest
import models.ProbabilityBased as kalman
import utils.GeneralFunctions as optf


if len(sys.argv) > 1:
    i = dtest.list_links[int(sys.argv[1])]
    name = str(sys.argv[2])
else:
    i = dtest.list_links[0]
    name = "test"

df = dtest.df_expirement(i)

window_length_savgol = libs_cpd.WindowSizeSelection(time_series=list(df.GR),
                                                    wss_algorithm='summary_statistics_subsequence').get_window_size()[0]
norm_filter_gr = optf.normalization_linear(optf.filter_Savgol(df.Resist_short, window_length_savgol))
window_length = libs_cpd.WindowSizeSelection(time_series=norm_filter_gr,
                                             wss_algorithm='dominant_fourier_frequency', window_max=1000, window_min=50).get_window_size()[0]

cps_list_kalman = kalman.online_detection(list(df['GR']), window=window_length, queue_window=10, treshold_coef=4.3)
df['cps_kalman'] = cps_list_kalman

tsad_average_results = crtest.tsad_average(df.cps_kalman, df.CPs)
tsad_nab_results = crtest.tsad_nab(df.cps_kalman, df.CPs)
tsad_nab_results.update(tsad_average_results)

report = crtest.create_report(tsad_nab_results)


# Save the report and an image with the predicted labels.
libs_cpd.plt.figure(figsize=(12, 3))
libs_cpd.plt.plot(list(df.CPs), label='original CPs')
libs_cpd.plt.plot(cps_list_kalman, label='predicted CPs')
libs_cpd.plt.legend(loc="center right", bbox_to_anchor=(1.18, 0.5))
libs_cpd.plt.savefig('predicted_' + name + '.jpg')

with open('./predicted_' + name + '.txt', 'w') as out:
    for key, val in report.items():
        out.write('{}:{}\n'.format(key, val))
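A sketch of invoking the runner from the command line (argument meanings inferred from the sys.argv handling above: the first argument indexes dtest.list_links, the second names the output files), e.g. python kalman_runner.py 1 229G_run.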
Empty file.
Empty file.