Adding sampling tests and optimize imports
alexmindset committed May 13, 2024
1 parent 3f0df9e commit 1479535
Showing 2 changed files with 83 additions and 14 deletions.
29 changes: 15 additions & 14 deletions insolver/feature_engineering/sampling.py
@@ -1,5 +1,6 @@
-import pandas as pd
-import numpy as np
+from numpy import arange, repeat, delete, append
+from numpy.random import choice
+from pandas import DataFrame, concat


class Sampling:
@@ -33,7 +34,7 @@ def sample_dataset(self, df):
            df (pandas.Dataframe): The dataframe.

        Raises:
-            NotImplementedError: If self.method is not supported.
+            NotImplementedError: If `self.method` is not supported.

        Returns:
            New dataset with selected rows.
@@ -80,7 +81,7 @@ def _systematic_sampling(self, df):
            New dataset with selected rows.
        """
        # get indexes with selected step
-        indexes = np.arange(0, len(df), step=self.n)
+        indexes = arange(0, len(df), step=self.n)
        # get only selected indexes
        systematic_sample = df.iloc[indexes]
        return systematic_sample
@@ -100,22 +101,22 @@ def _cluster_sampling(self, df):
        # count clusters to check
        clusters_count = cluster_df['cluster_id'].unique().sum()

-        cluster_sample = pd.DataFrame()
+        cluster_sample = DataFrame()

        # if the selected number of clusters is bigger then the created number raise error
        if self.n > clusters_count:
-            raise Exception(f'{self.n} cannot be bigger then number of clusters.')
+            raise ValueError(f'{self.n} cannot be bigger then number of clusters.')

        # if the selected number of clusters equals the created number return df
        elif self.n == clusters_count:
            return df

        else:
            # randomly chose clusters to keep
-            clusters_to_keep = np.random.choice(cluster_df['cluster_id'].unique(), self.n)
+            clusters_to_keep = choice(cluster_df['cluster_id'].unique(), self.n)
            for cluster in clusters_to_keep:
                # create a new DataFrame only with the selected clusters
-                cluster_sample = pd.concat([cluster_sample, cluster_df[cluster_df['cluster_id'] == cluster]])
+                cluster_sample = concat([cluster_sample, cluster_df[cluster_df['cluster_id'] == cluster]])

        return cluster_sample

@@ -132,13 +133,13 @@ def _stratified_sampling(self, df):
        # create clusters
        cluster_df = self._create_clusters(df)

-        stratified_sample = pd.DataFrame()
+        stratified_sample = DataFrame()

        for cluster in cluster_df['cluster_id'].unique():
            # get selected number of values from each cluster
            sample_cluster = cluster_df[cluster_df['cluster_id'] == cluster].sample(n=self.n)
            # create a new DataFrame only with the selected values in the cluster
-            stratified_sample = pd.concat([stratified_sample, sample_cluster])
+            stratified_sample = concat([stratified_sample, sample_cluster])

        return stratified_sample

@@ -174,22 +175,22 @@ def _create_clusters(self, df):
        else:
            try:
                # try if the clusters can be filled exactly
-                new_df['cluster_id'] = np.repeat([range(1, self.n_clusters + 1)], cluster_size)
+                new_df['cluster_id'] = repeat(list(range(1, self.n_clusters + 1)), cluster_size)

            except ValueError:
                # if not get indexes
-                indexes = np.repeat([range(1, self.n_clusters + 1)], cluster_size)
+                indexes = repeat(list(range(1, self.n_clusters + 1)), cluster_size)
                # calculate the difference
                diff = len(indexes) - len(df)

                # if the difference is greater than 0 delete one row
                if diff > 0:
                    for i in range(diff):
-                        new_df['cluster_id'] = np.delete(indexes, len(indexes) - 1)
+                        new_df['cluster_id'] = delete(indexes, len(indexes) - 1)

                # if the difference is less than 0 add one row
                if diff < 0:
                    for i in range(abs(diff)):
-                        new_df['cluster_id'] = np.append(indexes, self.n_clusters)
+                        new_df['cluster_id'] = append(indexes, self.n_clusters)

        return new_df
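
The _create_clusters hunk above pads or trims the generated cluster ids when len(df) is not evenly divisible by n_clusters. Below is a minimal standalone sketch of that balancing logic using the newly imported numpy helpers; the assign_cluster_ids function name and the round()-based cluster size are illustrative assumptions, not insolver API.

# Minimal sketch, not part of the commit: balance cluster ids the way
# _create_clusters does. assign_cluster_ids and the round()-based
# cluster_size are assumptions for illustration only.
from numpy import arange, repeat, delete, append
from pandas import DataFrame


def assign_cluster_ids(df, n_clusters):
    cluster_size = round(len(df) / n_clusters)  # assumed rounding rule
    indexes = repeat(list(range(1, n_clusters + 1)), cluster_size)
    diff = len(indexes) - len(df)
    if diff > 0:
        # too many ids: trim the surplus from the tail
        for _ in range(diff):
            indexes = delete(indexes, len(indexes) - 1)
    elif diff < 0:
        # too few ids: pad with the last cluster id
        for _ in range(abs(diff)):
            indexes = append(indexes, n_clusters)
    out = df.copy()
    out['cluster_id'] = indexes
    return out


print(assign_cluster_ids(DataFrame({'A': arange(7)}), 2)['cluster_id'].tolist())
# [1, 1, 1, 1, 2, 2, 2]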
68 changes: 68 additions & 0 deletions tests/feature_engineering/test_sampling.py
@@ -0,0 +1,68 @@
import pytest
import pandas as pd
import numpy as np
from insolver.feature_engineering import Sampling


@pytest.fixture
def sample_data():
    return pd.DataFrame({'A': np.arange(100), 'B': np.random.rand(100), 'Cluster': np.repeat(np.arange(10), 10)})


def test_simple_sampling(sample_data):
    sampler = Sampling(n=5, method='simple')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 5


def test_systematic_sampling(sample_data):
    sampler = Sampling(n=10, method='systematic')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 10


def test_cluster_sampling(sample_data):
    sampler = Sampling(n=2, method='cluster')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 20  # Two clusters chosen, each with 10 elements


def test_stratified_sampling(sample_data):
    sampler = Sampling(n=2, method='stratified', cluster_column='Cluster')
    sampled_df = sampler.sample_dataset(sample_data)
    assert len(sampled_df) == 20  # Two elements chosen from each cluster


def test_method_not_supported(sample_data):
    sampler = Sampling(n=5, method='unsupported_method')
    with pytest.raises(NotImplementedError):
        sampler.sample_dataset(sample_data)


def test_cluster_sampling_with_invalid_n(sample_data):
    sampler = Sampling(n=150, method='cluster')
    with pytest.raises(ValueError):
        sampler.sample_dataset(sample_data)
    sampler = Sampling(n=55, method='cluster')
    sampler.sample_dataset(sample_data)


def test_create_clusters_with_null_values():
    with pytest.raises(ValueError):
        sampler = Sampling(n=5, method='cluster', cluster_column='Cluster')
        sampler._create_clusters(pd.DataFrame({'Cluster': [1, 2, np.nan, 4]}))


def test_create_clusters_with_insufficient_data():
    with pytest.raises(ValueError):
        sampler = Sampling(n=10, n_clusters=10, method='cluster')
        sampler._create_clusters(pd.DataFrame({'A': [1, 2, 3]}))


def test_create_clusters_with_extra_data():
    data = {'A': np.arange(7)}
    df = pd.DataFrame(data)
    sampler = Sampling(n=5, n_clusters=2, method='cluster')
    new_df = sampler._create_clusters(df)
    assert len(new_df) == 7
    assert all(1 <= cluster_id <= 2 for cluster_id in new_df['cluster_id'])
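
For orientation, a minimal usage sketch of the Sampling API exercised by the tests above; only constructor arguments that appear in the tests are used, nothing else is assumed.

# Minimal usage sketch based on the tests above; no arguments beyond those shown are assumed.
import pandas as pd
import numpy as np
from insolver.feature_engineering import Sampling

df = pd.DataFrame({'A': np.arange(100), 'B': np.random.rand(100)})
simple = Sampling(n=5, method='simple').sample_dataset(df)  # 5 randomly selected rows
systematic = Sampling(n=10, method='systematic').sample_dataset(df)  # every 10th row
print(len(simple), len(systematic))  # 5 10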
