Commit

add setup py and LICENSE
GishB committed Mar 20, 2024
1 parent f2d4952 commit 20d9e9f
Showing 24 changed files with 1,573 additions and 13 deletions.
58 changes: 58 additions & 0 deletions build/lib/data/CloudData.py
@@ -0,0 +1,58 @@
import pandas as pd
from io import StringIO
import requests


class DrillingData:
""" This class helps to load las files for one of selected horizontal well.
Notes:
- If you define name which does not exist then program will raise NameError!
- All data are restored on public yandex cloud server.
- If you need to load just raw data then use load_raw_data method.
Attributes:
dataset_name: indicate which well you need to load data.
"""

def __init__(self,
dataset_name: str = "default",
sep: str = ","):
self.url_dict = {
"229G": "https://storage.yandexcloud.net/cloud-files-public/229G_las_files.csv",
"231G": "https://storage.yandexcloud.net/cloud-files-public/231G_las_files.csv",
"237G": "https://storage.yandexcloud.net/cloud-files-public/237G_las_files.csv",
"xxxAA564G": "https://storage.yandexcloud.net/cloud-files-public/dataframe.csv",
"xxxAA684G": "https://storage.yandexcloud.net/cloud-files-public/dataframe.csv"
}
self.dataset_name = dataset_name
self.sep = sep
        if dataset_name not in ["default", "229G", "231G", "237G", "xxxAA684G", "xxxAA564G"]:
            raise NameError("There is no such dataset name.")
if dataset_name in ["xxxAA684G", "xxxAA564G"]:
self.sep = "|"

def load_raw_data(self, url: str) -> pd.DataFrame:
""" Load las files as it is in pandas format.
Warning:
- These files are available only for education purposes and shall not be used for any other points.
Notes:
- value like -9999 means that data has been missing.
- unitless column means type of layer.
- uR/h the most important drilling data columns for analysis.
Returns:
pandas dataframe with all available columns and rows from chosen las file.
"""
return pd.read_csv(StringIO(requests.get(url).content.decode('utf-8')), sep=self.sep)

def get(self) -> pd.DataFrame:
if self.dataset_name == "default":
raw_data = self.load_raw_data(url=self.url_dict.get("237G"))
else:
raw_data = self.load_raw_data(url=self.url_dict.get(self.dataset_name))
if self.dataset_name in ["xxxAA684G", "xxxAA564G"]:
raw_data = raw_data[raw_data[raw_data.columns[0]] == self.dataset_name]
return raw_data.reset_index(drop=True)
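A minimal usage sketch (assuming the public Yandex Cloud URLs above are reachable and the module is importable as data.CloudData, as elsewhere in this repository):

# Load the "237G" well and inspect the first rows.
from data.CloudData import DrillingData

loader = DrillingData(dataset_name="237G")
df = loader.get()
print(df.shape)
print(df.head())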
72 changes: 72 additions & 0 deletions build/lib/data/DataTransformers.py
@@ -0,0 +1,72 @@
import pandas as pd
import numpy as np


class CloudDataTransformer:
""" This class helps to transform raw data as it expected for experiments in the project.
Attributes:
df: dataframe which was loaded from the cloud.
dataset_name: dataset name which was loaded via CloudData
"""

def __init__(self, df: pd.DataFrame = None,
dataset_name: str = "default"):
self.df = df
self.dataset_name = dataset_name

if self.df is None:
raise ValueError("dataset has not been defined!")

if self.dataset_name not in ["default", "229G", "231G", "237G", "xxxAA684G", "xxxAA564G"]:
            raise NameError("There is no such dataset name.")

@staticmethod
    def add_time_column(df: pd.DataFrame) -> pd.DataFrame:
        # Index the rows with synthetic 1-second timestamps.
        df['time'] = np.arange(df.shape[0]).astype('datetime64[s]')
        df = df.set_index('time')
        return df

@staticmethod
def replace_nan_values(df: pd.DataFrame) -> pd.DataFrame:
return df.replace(['-9999', -9999, 'missing', '#'], np.nan)

def drop_nan_values(self, df: pd.DataFrame) -> pd.DataFrame:
df = self.replace_nan_values(df)
df = df[df['unitless'].notna()]
df = df[df['uR/h'].notna()]
return df

@staticmethod
    def add_expected_change_points(df: pd.DataFrame) -> pd.DataFrame:
        # Mark a change point (1) wherever the lithology label differs in the next row.
        cps_list = [1 if df['unitless'].iloc[i] != df['unitless'].iloc[i + 1] else 0 for i in range(df.shape[0] - 1)]
        df['CPs'] = cps_list + [0]
        return df
return df

@staticmethod
def take_expected_columns(df: pd.DataFrame) -> pd.DataFrame:
return df[["uR/h", "ohmm", "ohmm.6", "m/hr", "unitless", "CPs"]].reset_index(drop=True)

@staticmethod
def rename_column_special(df: pd.DataFrame) -> pd.DataFrame:
return df.rename(columns={"uR/h": "GR",
"ohmm": "Resist_short",
"ohmm.6": "Resist_long",
"unitless": "LITHOLOGY",
"m/hr": "DrillingSpeed"})

def transform(self) -> pd.DataFrame:
""" Transform data initial point.
Returns:
DataFrame as it expected to be for any future tasks.
"""
df = self.df
if self.dataset_name in ["xxxAA684G", "xxxAA564G"]:
df = self.replace_nan_values(df)
df = self.add_expected_change_points(df)
df = self.take_expected_columns(df)
df = self.rename_column_special(df)
df = self.add_time_column(df)
return df
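A minimal end-to-end sketch combining the loader and the transformer above (dataset name "237G" chosen for illustration):

from data.CloudData import DrillingData
from data.DataTransformers import CloudDataTransformer

raw = DrillingData(dataset_name="237G").get()
df = CloudDataTransformer(df=raw, dataset_name="237G").transform()
print(df.columns.tolist())  # ['GR', 'Resist_short', 'Resist_long', 'DrillingSpeed', 'LITHOLOGY', 'CPs']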
173 changes: 173 additions & 0 deletions build/lib/data/SythData.py
@@ -0,0 +1,173 @@
import numpy as np
import pandas as pd


class SythDataConstructor:
""" This is fundament class which should be used for any syth data generators.
Notes:
length_data % cpu_numbers has to be equal 0!
Attributes:
frequency: which freq should be used seconds, minutes, days.
length_data: just how many points should be generated.
cps_number: number of change points over generated data.
"""

def __init__(self,
white_noise_level: str = "default",
frequency: str = "s",
length_data: int = 24 * 7 * 15 + 15,
cps_number: int = 15):
self.frequency = frequency
self.length_data = length_data
self.cps_number = cps_number

self.white_mean = 0
if white_noise_level == "default":
self.white_std = 0.5
elif white_noise_level == "max":
self.white_std = 1
elif white_noise_level == "min":
self.white_std = 0.01
else:
raise NameError("Not implemented white noise level!")

        if length_data % cps_number != 0:
            raise ValueError("length_data must be divisible by cps_number for synthetic data generation!")

def generate_empty_df(self) -> pd.DataFrame:
""" Generate dataframe with timestamps.
Returns:
pandas dataframe with expected frequency and length
"""
return pd.DataFrame(index=pd.date_range(start="10/07/1999",
periods=self.length_data,
freq=self.frequency,
normalize=True,
inclusive="both",
name="time"))

def generate_white_noise(self) -> np.array:
""" Generate random noise for your data.
Returns:
array of white noise based on expected length of data.
"""
return np.random.normal(self.white_mean,
self.white_std,
size=self.length_data)

def generate_array_of_change_points(self) -> np.array:
""" Generate values which represent CPs over syth data.
Returns:
numpy array of int values where 1 is change point and 0 is default value.
"""
cps_index = [i for i in range(self.length_data // self.cps_number,
self.length_data,
self.length_data // self.cps_number)]
dp = [0 if i not in cps_index else 1 for i in range(self.length_data)]
return np.array(dp)

    def generate_data(self) -> np.array:
        """ Generate the synthetic data array.
        Returns:
            expected synthetic data based on the subclass's idea.
        """
        raise NotImplementedError("Subclasses must implement generate_data.")

    def get(self) -> pd.DataFrame:
        """ Get the synthetic data.
        Returns:
            pandas DataFrame with synthetic data and a time index.
        """
        raise NotImplementedError("Subclasses must implement get.")


class LinearSteps(SythDataConstructor):
def get_linear_array(self,
beta_past: float,
k_past: float,
beta_mutation_coeff: float,
k_mutation_coeff: float) -> tuple[np.array, float, float]:
""" Generate random linear array based on past observation
Notes:
beta_mutation_coeff as well as k_mutation_coeff should be defined based on expertise. These coefficients
help to connect nearest arrays.
Args:
beta_past: beta value in the past array.
k_past: k coefficient in the past array.
beta_mutation_coeff: treshold for beta deviation.
k_mutation_coeff: treshold for k coeff deviation.
Returns:
tuple of generated data and info for this generations beta and k_coeff.
"""
beta = np.random.uniform(beta_past, 1)
k_coeff = np.random.uniform(k_past, 1)
if np.random.uniform(0, 1) > beta_mutation_coeff:
beta = np.random.uniform(-1, 1)
if np.random.uniform(0, 1) > k_mutation_coeff:
k_coeff = np.random.uniform(-1, 1)
dp = [k_coeff * x + beta for x in range(0, self.length_data // self.cps_number)]
return np.array(dp), beta, k_coeff

def generate_data(self, initial_beta: float = -0.01,
initial_k: float = 0.2,
beta_mutation_coeff: float = 0.8,
k_mutation_coeff: float = 0.2) -> np.array:
dp = []
for steps in range(self.cps_number):
temp_info = self.get_linear_array(initial_beta,
initial_k,
beta_mutation_coeff,
k_mutation_coeff)
dp.extend(temp_info[0])
initial_beta = temp_info[1]
initial_k = temp_info[2]
return np.array(dp)

def get(self):
df = self.generate_empty_df()
df['x'] = np.add(self.generate_data(), self.generate_white_noise())
df['CPs'] = self.generate_array_of_change_points()
return df


class SinusoidWaves(SythDataConstructor):
def get_sinusoid_array(self, beta_past: float, beta_mutation_coeff: float) -> tuple[np.array, float]:
""" Generate sinusoid waves over expected shape.
Args:
beta_past: beta coefficient for sinus wave.
beta_mutation_coeff: coeff for mutation operator.
Returns:
array of sinusoid data
"""
beta_past = np.random.uniform(low=beta_past, high=2)
if np.random.uniform(low=0, high=1) > beta_mutation_coeff:
beta_past = np.random.uniform(low=-2, high=2)
        x = np.linspace(start=0, stop=self.length_data // self.cps_number, num=self.length_data // self.cps_number)
return np.sin(x) * beta_past, beta_past

def generate_data(self, initial_beta: float = 0.5, beta_mutation_coeff: float = 0.5) -> np.array:
dp = []
for steps in range(self.cps_number):
temp_info = self.get_sinusoid_array(initial_beta,
beta_mutation_coeff)
dp.extend(temp_info[0])
initial_beta = temp_info[1]
return np.array(dp)

def get(self):
df = self.generate_empty_df()
df['x'] = np.add(self.generate_data(), self.generate_white_noise())
df['CPs'] = self.generate_array_of_change_points()
return df
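A minimal sketch of generating synthetic series with the two generators above (the parameter values here are illustrative; length_data must be divisible by cps_number):

from data.SythData import LinearSteps, SinusoidWaves

steps = LinearSteps(white_noise_level="min", length_data=1500, cps_number=15).get()
waves = SinusoidWaves(white_noise_level="default", length_data=1500, cps_number=15).get()
print(steps.head())             # columns: x (signal) and CPs (change-point labels)
print(int(waves['CPs'].sum()))  # 14 internal change points for 15 segments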
Empty file added build/lib/data/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions build/lib/examples/DefaultStreamlitApp.py
@@ -0,0 +1,38 @@
import streamlit as st
import sys

sys.path.append("..")

import utils.StreamlitFunctions as useful

st.title('Change Point Detection examples.')

st.sidebar.header('UI Model pipeline')
option_model = st.sidebar.selectbox(
'Select CPD model',
("Singular Sequence Decomposition", "Kalman Filter"))
model_params = useful.init_model_params(model_name=option_model)

option_data = st.sidebar.selectbox(
'Select dataset',
("None", "Syth-Steps", "Syth-Sinusoid"))

df = None
if option_data != "None":
data_loader_params = useful.init_data_loader_params()
df = useful.data_loader(option=option_data, params=data_loader_params)
useful.data_info(df)
useful.data_plot(df)

option_start_model = st.sidebar.selectbox(
'Run selected model',
("None", "RUN!"))

df_updated = None
if option_start_model != "None" and df is not None:
df_updated = useful.init_model_pipeline(name_model=option_model, params=model_params, df=df)

if df_updated is not None:
summary_df = useful.model_summary(df=df_updated)
useful.data_info(summary_df)
useful.plot_results(df_updated)
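The app is launched like any Streamlit script, e.g. streamlit run DefaultStreamlitApp.py, run from the examples directory so that the sys.path.append("..") above resolves the utils import.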
Empty file added build/lib/examples/__init__.py
Empty file.
Empty file.
Empty file.
45 changes: 45 additions & 0 deletions build/lib/experiments/OnlineCPD_experiments/kalman_runner.py
@@ -0,0 +1,45 @@
import sys
import os
sys.path.append(os.path.abspath("../.."))

from utils import libs_cpd, Reports as crtest
import data.CloudData as dtest
import models.ProbabilityBased as kalman
import utils.GeneralFunctions as optf


if len(sys.argv) > 1:
    i = dtest.list_links[int(sys.argv[1])]
    name = str(sys.argv[2])
else:
    i = dtest.list_links[0]
    name = "test"

df = dtest.df_expirement(i)

window_length_savgol = libs_cpd.WindowSizeSelection(time_series=list(df.GR),
                                                    wss_algorithm='summary_statistics_subsequence').get_window_size()[0]
norm_filter_gr = optf.normalization_linear(optf.filter_Savgol(df.Resist_short, window_length_savgol))
window_length = libs_cpd.WindowSizeSelection(time_series=norm_filter_gr,
                                             wss_algorithm='dominant_fourier_frequency', window_max=1000, window_min=50).get_window_size()[0]

cps_list_kalman = kalman.online_detection(list(df['GR']), window=window_length, queue_window=10, treshold_coef=4.3)
df['cps_kalman'] = cps_list_kalman

tsad_average_results = crtest.tsad_average(df.cps_kalman, df.CPs)
tsad_nab_results = crtest.tsad_nab(df.cps_kalman, df.CPs)
tsad_nab_results.update(tsad_average_results)

report = crtest.create_report(tsad_nab_results)


# Save the report and an image with the predicted labels.
libs_cpd.plt.figure(figsize=(12, 3))
libs_cpd.plt.plot(list(df.CPs), label='original CPs')
libs_cpd.plt.plot(cps_list_kalman, label='predicted CPs')
libs_cpd.plt.legend(loc="center right", bbox_to_anchor=(1.18, 0.5))
libs_cpd.plt.savefig('predicted_' + name + '.jpg')

with open('./predicted_' + name + '.txt', 'w') as out:
    for key, val in report.items():
        out.write('{}:{}\n'.format(key, val))
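A sketch of invoking the runner from the command line (argument meanings inferred from the sys.argv handling above: the first argument indexes dtest.list_links, the second names the output files), e.g. python kalman_runner.py 1 229G_run.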
Empty file.
Empty file.