-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
24 changed files
with
1,573 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import pandas as pd | ||
from io import StringIO | ||
import requests | ||
|
||
|
||
class DrillingData: | ||
""" This class helps to load las files for one of selected horizontal well. | ||
Notes: | ||
- If you define name which does not exist then program will raise NameError! | ||
- All data are restored on public yandex cloud server. | ||
- If you need to load just raw data then use load_raw_data method. | ||
Attributes: | ||
dataset_name: indicate which well you need to load data. | ||
""" | ||
|
||
def __init__(self, | ||
dataset_name: str = "default", | ||
sep: str = ","): | ||
self.url_dict = { | ||
"229G": "https://storage.yandexcloud.net/cloud-files-public/229G_las_files.csv", | ||
"231G": "https://storage.yandexcloud.net/cloud-files-public/231G_las_files.csv", | ||
"237G": "https://storage.yandexcloud.net/cloud-files-public/237G_las_files.csv", | ||
"xxxAA564G": "https://storage.yandexcloud.net/cloud-files-public/dataframe.csv", | ||
"xxxAA684G": "https://storage.yandexcloud.net/cloud-files-public/dataframe.csv" | ||
} | ||
self.dataset_name = dataset_name | ||
self.sep = sep | ||
if dataset_name not in ["default", "229G", "231G", "237G", "xxxAA684G", "xxxAA564G"]: | ||
raise NameError("There is not such dataset name.") | ||
if dataset_name in ["xxxAA684G", "xxxAA564G"]: | ||
self.sep = "|" | ||
|
||
def load_raw_data(self, url: str) -> pd.DataFrame: | ||
""" Load las files as it is in pandas format. | ||
Warning: | ||
- These files are available only for education purposes and shall not be used for any other points. | ||
Notes: | ||
- value like -9999 means that data has been missing. | ||
- unitless column means type of layer. | ||
- uR/h the most important drilling data columns for analysis. | ||
Returns: | ||
pandas dataframe with all available columns and rows from chosen las file. | ||
""" | ||
return pd.read_csv(StringIO(requests.get(url).content.decode('utf-8')), sep=self.sep) | ||
|
||
def get(self) -> pd.DataFrame: | ||
if self.dataset_name == "default": | ||
raw_data = self.load_raw_data(url=self.url_dict.get("237G")) | ||
else: | ||
raw_data = self.load_raw_data(url=self.url_dict.get(self.dataset_name)) | ||
if self.dataset_name in ["xxxAA684G", "xxxAA564G"]: | ||
raw_data = raw_data[raw_data[raw_data.columns[0]] == self.dataset_name] | ||
return raw_data.reset_index(drop=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import pandas as pd | ||
import numpy as np | ||
|
||
|
||
class CloudDataTransformer: | ||
""" This class helps to transform raw data as it expected for experiments in the project. | ||
Attributes: | ||
df: dataframe which was loaded from the cloud. | ||
dataset_name: dataset name which was loaded via CloudData | ||
""" | ||
|
||
def __init__(self, df: pd.DataFrame = None, | ||
dataset_name: str = "default"): | ||
self.df = df | ||
self.dataset_name = dataset_name | ||
|
||
if self.df is None: | ||
raise ValueError("dataset has not been defined!") | ||
|
||
if self.dataset_name not in ["default", "229G", "231G", "237G", "xxxAA684G", "xxxAA564G"]: | ||
raise NameError("There is not such dataset name.") | ||
|
||
@staticmethod | ||
def add_time_column(df: pd.DataFrame) -> pd.DataFrame: | ||
df['time'] = np.arange(0, df.shape[0] * 1, 1).astype('datetime64[s]') | ||
df = df.set_index('time') | ||
return df | ||
|
||
@staticmethod | ||
def replace_nan_values(df: pd.DataFrame) -> pd.DataFrame: | ||
return df.replace(['-9999', -9999, 'missing', '#'], np.nan) | ||
|
||
def drop_nan_values(self, df: pd.DataFrame) -> pd.DataFrame: | ||
df = self.replace_nan_values(df) | ||
df = df[df['unitless'].notna()] | ||
df = df[df['uR/h'].notna()] | ||
return df | ||
|
||
@staticmethod | ||
def add_expected_change_points(df: pd.DataFrame) -> pd.DataFrame: | ||
cps_list = [1 if df['unitless'].iloc[i] != df['unitless'].iloc[i + 1] else 0 for i in range(df.shape[0] - 1)] | ||
df['CPs'] = cps_list + [0] | ||
return df | ||
|
||
@staticmethod | ||
def take_expected_columns(df: pd.DataFrame) -> pd.DataFrame: | ||
return df[["uR/h", "ohmm", "ohmm.6", "m/hr", "unitless", "CPs"]].reset_index(drop=True) | ||
|
||
@staticmethod | ||
def rename_column_special(df: pd.DataFrame) -> pd.DataFrame: | ||
return df.rename(columns={"uR/h": "GR", | ||
"ohmm": "Resist_short", | ||
"ohmm.6": "Resist_long", | ||
"unitless": "LITHOLOGY", | ||
"m/hr": "DrillingSpeed"}) | ||
|
||
def transform(self) -> pd.DataFrame: | ||
""" Transform data initial point. | ||
Returns: | ||
DataFrame as it expected to be for any future tasks. | ||
""" | ||
df = self.df | ||
if self.dataset_name in ["xxxAA684G", "xxxAA564G"]: | ||
df = self.replace_nan_values(df) | ||
df = self.add_expected_change_points(df) | ||
df = self.take_expected_columns(df) | ||
df = self.rename_column_special(df) | ||
df = self.add_time_column(df) | ||
return df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
class SythDataConstructor: | ||
""" This is fundament class which should be used for any syth data generators. | ||
Notes: | ||
length_data % cpu_numbers has to be equal 0! | ||
Attributes: | ||
frequency: which freq should be used seconds, minutes, days. | ||
length_data: just how many points should be generated. | ||
cps_number: number of change points over generated data. | ||
""" | ||
|
||
def __init__(self, | ||
white_noise_level: str = "default", | ||
frequency: str = "s", | ||
length_data: int = 24 * 7 * 15 + 15, | ||
cps_number: int = 15): | ||
self.frequency = frequency | ||
self.length_data = length_data | ||
self.cps_number = cps_number | ||
|
||
self.white_mean = 0 | ||
if white_noise_level == "default": | ||
self.white_std = 0.5 | ||
elif white_noise_level == "max": | ||
self.white_std = 1 | ||
elif white_noise_level == "min": | ||
self.white_std = 0.01 | ||
else: | ||
raise NameError("Not implemented white noise level!") | ||
|
||
if length_data % cps_number != 0: | ||
raise ValueError("Not equal length of data and cpu_numbers expected from syth data!") | ||
|
||
def generate_empty_df(self) -> pd.DataFrame: | ||
""" Generate dataframe with timestamps. | ||
Returns: | ||
pandas dataframe with expected frequency and length | ||
""" | ||
return pd.DataFrame(index=pd.date_range(start="10/07/1999", | ||
periods=self.length_data, | ||
freq=self.frequency, | ||
normalize=True, | ||
inclusive="both", | ||
name="time")) | ||
|
||
def generate_white_noise(self) -> np.array: | ||
""" Generate random noise for your data. | ||
Returns: | ||
array of white noise based on expected length of data. | ||
""" | ||
return np.random.normal(self.white_mean, | ||
self.white_std, | ||
size=self.length_data) | ||
|
||
def generate_array_of_change_points(self) -> np.array: | ||
""" Generate values which represent CPs over syth data. | ||
Returns: | ||
numpy array of int values where 1 is change point and 0 is default value. | ||
""" | ||
cps_index = [i for i in range(self.length_data // self.cps_number, | ||
self.length_data, | ||
self.length_data // self.cps_number)] | ||
dp = [0 if i not in cps_index else 1 for i in range(self.length_data)] | ||
return np.array(dp) | ||
|
||
def generate_data(self) -> np.array: | ||
""" Generate syth data array | ||
Returns: | ||
expected syth data based on class idea. | ||
""" | ||
... | ||
|
||
def get(self) -> pd.DataFrame: | ||
""" Get syth data. | ||
Returns: | ||
pandas dataframe with syth data and time index. | ||
""" | ||
... | ||
|
||
|
||
class LinearSteps(SythDataConstructor): | ||
def get_linear_array(self, | ||
beta_past: float, | ||
k_past: float, | ||
beta_mutation_coeff: float, | ||
k_mutation_coeff: float) -> tuple[np.array, float, float]: | ||
""" Generate random linear array based on past observation | ||
Notes: | ||
beta_mutation_coeff as well as k_mutation_coeff should be defined based on expertise. These coefficients | ||
help to connect nearest arrays. | ||
Args: | ||
beta_past: beta value in the past array. | ||
k_past: k coefficient in the past array. | ||
beta_mutation_coeff: treshold for beta deviation. | ||
k_mutation_coeff: treshold for k coeff deviation. | ||
Returns: | ||
tuple of generated data and info for this generations beta and k_coeff. | ||
""" | ||
beta = np.random.uniform(beta_past, 1) | ||
k_coeff = np.random.uniform(k_past, 1) | ||
if np.random.uniform(0, 1) > beta_mutation_coeff: | ||
beta = np.random.uniform(-1, 1) | ||
if np.random.uniform(0, 1) > k_mutation_coeff: | ||
k_coeff = np.random.uniform(-1, 1) | ||
dp = [k_coeff * x + beta for x in range(0, self.length_data // self.cps_number)] | ||
return np.array(dp), beta, k_coeff | ||
|
||
def generate_data(self, initial_beta: float = -0.01, | ||
initial_k: float = 0.2, | ||
beta_mutation_coeff: float = 0.8, | ||
k_mutation_coeff: float = 0.2) -> np.array: | ||
dp = [] | ||
for steps in range(self.cps_number): | ||
temp_info = self.get_linear_array(initial_beta, | ||
initial_k, | ||
beta_mutation_coeff, | ||
k_mutation_coeff) | ||
dp.extend(temp_info[0]) | ||
initial_beta = temp_info[1] | ||
initial_k = temp_info[2] | ||
return np.array(dp) | ||
|
||
def get(self): | ||
df = self.generate_empty_df() | ||
df['x'] = np.add(self.generate_data(), self.generate_white_noise()) | ||
df['CPs'] = self.generate_array_of_change_points() | ||
return df | ||
|
||
|
||
class SinusoidWaves(SythDataConstructor): | ||
def get_sinusoid_array(self, beta_past: float, beta_mutation_coeff: float) -> tuple[np.array, float]: | ||
""" Generate sinusoid waves over expected shape. | ||
Args: | ||
beta_past: beta coefficient for sinus wave. | ||
beta_mutation_coeff: coeff for mutation operator. | ||
Returns: | ||
array of sinusoid data | ||
""" | ||
beta_past = np.random.uniform(low=beta_past, high=2) | ||
if np.random.uniform(low=0, high=1) > beta_mutation_coeff: | ||
beta_past = np.random.uniform(low=-2, high=2) | ||
x = np.linspace(start=0, stop= self.length_data // self.cps_number, num=self.length_data // self.cps_number) | ||
return np.sin(x) * beta_past, beta_past | ||
|
||
def generate_data(self, initial_beta: float = 0.5, beta_mutation_coeff: float = 0.5) -> np.array: | ||
dp = [] | ||
for steps in range(self.cps_number): | ||
temp_info = self.get_sinusoid_array(initial_beta, | ||
beta_mutation_coeff) | ||
dp.extend(temp_info[0]) | ||
initial_beta = temp_info[1] | ||
return np.array(dp) | ||
|
||
def get(self): | ||
df = self.generate_empty_df() | ||
df['x'] = np.add(self.generate_data(), self.generate_white_noise()) | ||
df['CPs'] = self.generate_array_of_change_points() | ||
return df |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import streamlit as st | ||
import sys | ||
|
||
sys.path.append("..") | ||
|
||
import utils.StreamlitFunctions as useful | ||
|
||
st.title('Change Point Detection examples.') | ||
|
||
st.sidebar.header('UI Model pipeline') | ||
option_model = st.sidebar.selectbox( | ||
'Select CPD model', | ||
("Singular Sequence Decomposition", "Kalman Filter")) | ||
model_params = useful.init_model_params(model_name=option_model) | ||
|
||
option_data = st.sidebar.selectbox( | ||
'Select dataset', | ||
("None", "Syth-Steps", "Syth-Sinusoid")) | ||
|
||
df = None | ||
if option_data != "None": | ||
data_loader_params = useful.init_data_loader_params() | ||
df = useful.data_loader(option=option_data, params=data_loader_params) | ||
useful.data_info(df) | ||
useful.data_plot(df) | ||
|
||
option_start_model = st.sidebar.selectbox( | ||
'Run selected model', | ||
("None", "RUN!")) | ||
|
||
df_updated = None | ||
if option_start_model != "None" and df is not None: | ||
df_updated = useful.init_model_pipeline(name_model=option_model, params=model_params, df=df) | ||
|
||
if df_updated is not None: | ||
summary_df = useful.model_summary(df=df_updated) | ||
useful.data_info(summary_df) | ||
useful.plot_results(df_updated) |
Empty file.
Empty file.
Empty file.
45 changes: 45 additions & 0 deletions
45
build/lib/experiments/OnlineCPD_experiments/kalman_runner.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import sys | ||
import os | ||
sys.path.append(os.path.abspath("../..")) | ||
|
||
from utils import libs_cpd, Reports as crtest | ||
import data.CloudData as dtest | ||
import models.ProbabilityBased as kalman | ||
import utils.GeneralFunctions as optf | ||
|
||
|
||
if len(sys.argv) > 1: | ||
i = dtest.list_links[int(sys.argv[1])]; | ||
name = str(sys.argv[2]); | ||
else: | ||
i = dtest.list_links[0]; | ||
name = "test"; | ||
|
||
df = dtest.df_expirement(i) | ||
|
||
window_length_savgol = libs_cpd.WindowSizeSelection(time_series = list(df.GR), | ||
wss_algorithm = 'summary_statistics_subsequence').get_window_size()[0] | ||
norm_filter_gr = optf.normalization_linear(optf.filter_Savgol(df.Resist_short, window_length_savgol)) | ||
window_length = libs_cpd.WindowSizeSelection(time_series = norm_filter_gr, | ||
wss_algorithm = 'dominant_fourier_frequency', window_max=1000, window_min=50).get_window_size()[0] | ||
|
||
cps_list_kalman = kalman.online_detection(list(df['GR']), window=window_length, queue_window=10, treshold_coef=4.3) | ||
df['cps_kalman'] = cps_list_kalman | ||
|
||
tsad_average_results = crtest.tsad_average(df.cps_kalman, df.CPs) | ||
tsad_nab_results = crtest.tsad_nab(df.cps_kalman, df.CPs) | ||
tsad_nab_results.update(tsad_average_results) | ||
|
||
report = crtest.create_report(tsad_nab_results) | ||
|
||
|
||
#downloand report and image with predicted labels | ||
libs_cpd.plt.figure(figsize=(12, 3)) | ||
libs_cpd.plt.plot(list(df.CPs), label='original CPs') | ||
libs_cpd.plt.plot(cps_list_kalman, label='predicted CPs') | ||
libs_cpd.plt.legend(loc="center right", bbox_to_anchor=(1.18, 0.5)) | ||
libs_cpd.plt.savefig('predicted_' + name + '.jpg') | ||
|
||
with open('./predicted_'+name+'.txt','w') as out: | ||
for key,val in report.items(): | ||
out.write('{}:{}\n'.format(key,val)) |
Empty file.
Empty file.
Oops, something went wrong.