
Commit

setuptools files for distribution.
miltondp committed Nov 1, 2018
1 parent 8c2b0ec commit 1307a40
Showing 6 changed files with 132 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .hgignore
@@ -5,3 +5,6 @@ __pycache__/
 .idea/
 experiments/results_*/
 experiments/tomato/results/
+build/
+clustermatch.egg-info/
+dist/
2 changes: 2 additions & 0 deletions clustermatch/__init__.py
@@ -0,0 +1,2 @@
__version__ = '0.1.4a1'
__short_description__ = 'Efficient clustering method for processing highly diverse data'
78 changes: 78 additions & 0 deletions clustermatch/main.py
@@ -0,0 +1,78 @@
import argparse
import logging

from clustermatch import __short_description__
from clustermatch.cluster import calculate_simmatrix, get_partition_spectral, get_pval_matrix_by_partition
from utils.data import merge_sources
from utils.output import save_partitions_simple

LOG_FORMAT = "[%(asctime)s] %(levelname)s: %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger('root')


def run():
    parser = argparse.ArgumentParser(description=__short_description__)

    # Mandatory parameters
    parser.add_argument('-i', '--input-files', required=True, type=str, nargs='+', help=
        'Path to input data files (could be one or multiple files). It could be a csv, xls (with different worksheets) or zip file.'
    )
    parser.add_argument('-k', '--n-clusters', required=True, type=int, nargs='+', help=
        'Number of final clusters (could contain multiple values).'
    )
    parser.add_argument('-o', '--output-file', required=True, type=str, help=
        'Path to output data partition file. The extension'
    )

    # Optional parameters
    parser.add_argument('--n-init', type=int, default=10, help=
        'Number of time the k-means algorithm will be run with different centroid seeds. '
        'The final results will be the best output of n_init consecutive runs in terms of inertia.'
    )
    parser.add_argument('--n-jobs', type=int, default=1, help=
        'The number of parallel jobs to run. -1 means using all processors.'
    )
    parser.add_argument('--minimum-objects', type=int, default=5, help=
        'Minimum amount of objects shared between two features to process them.'
    )

    args = parser.parse_args()

    # Validate parameters
    if any(x < 2 for x in args.n_clusters):
        parser.error('Number of final clusters must be >= 2')

    # Read data files
    logger.info('Reading input data files')
    merged_sources, feature_names, sources_names = merge_sources(args.input_files)

    # Run clustermatch
    logger.info(f'Getting similarity matrix for {merged_sources.shape[0]} variables')
    cm_sim_matrix = calculate_simmatrix(merged_sources, min_n_common_features=args.minimum_objects, n_jobs=args.n_jobs)

    logger.info(f'Running spectral clustering with k={args.n_clusters}')
    partition = get_partition_spectral(cm_sim_matrix, args.n_clusters, n_init=args.n_init, n_jobs=args.n_jobs)

    # if args.compute_pvalues:
    #     print('Getting pvalue matrix')
    #     cm_pvalue_sim_matrix = get_pval_matrix_by_partition(
    #         merged_sources, partition,
    #         k_internal, min_n_tomatoes,
    #         args.compute_pvalues_n_perms,
    #         n_jobs
    #     )
    #
    #     save_excel(cm_pvalue_sim_matrix, 'cm_pvalue', timestamp=timestamp)
    #     print('cm_pvalue saved')

    columns_order = ['k={0}'.format(str(k)) for k in args.n_clusters]

    logger.info(f'Saving partition to {args.output_file}')
    save_partitions_simple(
        partition,
        args.output_file,
        extra_columns={'sources': sources_names},
        columns_order=['sources', *columns_order],
        sort_by_columns=columns_order
    )
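For context, run() above is a thin CLI wrapper around the library functions it imports. A minimal sketch of driving the same pipeline programmatically, assuming the signatures shown in main.py; the input file name, k values, and output path are placeholders:

# Sketch only: mirrors the steps in run() above, without argparse.
from clustermatch.cluster import calculate_simmatrix, get_partition_spectral
from utils.data import merge_sources
from utils.output import save_partitions_simple

input_files = ['expression_data.xls']   # placeholder input file(s)
n_clusters = [4, 8]                     # one partition per requested k

# Merge all input sources into a single matrix with one row per variable.
merged_sources, feature_names, sources_names = merge_sources(input_files)

# Pairwise clustermatch similarity between variables.
cm_sim_matrix = calculate_simmatrix(merged_sources, min_n_common_features=5, n_jobs=2)

# Spectral clustering on the similarity matrix.
partition = get_partition_spectral(cm_sim_matrix, n_clusters, n_init=10, n_jobs=2)

# Save the partitions, one column per k, plus a 'sources' column.
columns_order = ['k={0}'.format(k) for k in n_clusters]
save_partitions_simple(
    partition,
    'partitions.xls',                   # placeholder output path
    extra_columns={'sources': sources_names},
    columns_order=['sources', *columns_order],
    sort_by_columns=columns_order,
)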
41 changes: 41 additions & 0 deletions setup.py
@@ -0,0 +1,41 @@
import setuptools
from clustermatch import __version__, __short_description__

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="clustermatch",
    version=__version__,
    author="Milton Pividori",
    author_email="miltondp@uchicago.edu",
    description=__short_description__,
    license='MIT',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/sinc-lab/clustermatch",
    packages=['clustermatch', 'utils'],
    python_requires='>=3',
    install_requires=[
        'numpy',
        'scipy',
        'pandas',
        'joblib',
        'scikit-learn',
        'xlrd',
        'xlwt',
        'openpyxl',
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console"
    ],
    entry_points={
        'console_scripts': [
            'clustermatch = clustermatch.main:run'
        ]
    },
)
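The console_scripts entry point above is what exposes the new command line: after installation, a clustermatch executable resolves to clustermatch.main:run. A rough way to exercise that same entry point from Python, under the assumption that argparse reads sys.argv as usual; file names are placeholders:

# Rough smoke test: the generated 'clustermatch' script ultimately calls
# clustermatch.main:run, and run() parses arguments from sys.argv.
import sys
from clustermatch.main import run

sys.argv = [
    'clustermatch',
    '-i', 'data.csv',        # placeholder input file
    '-k', '3', '5',
    '-o', 'partitions.xls',  # placeholder output file
    '--n-jobs', '1',
]
run()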
2 changes: 1 addition & 1 deletion utils/data.py
@@ -104,7 +104,7 @@ def merge_sources(source_files, rep_merge=np.mean):
     sources_names = [sn for sn_idx, sn in enumerate(sources_names)
                      for i in range(processed_sources[sn_idx].shape[0])]

-    full_sources = pd.concat(processed_sources)
+    full_sources = pd.concat(processed_sources, sort=True)

     # renamed duplicated
     if not full_sources.index.is_unique:
10 changes: 7 additions & 3 deletions utils/output.py
@@ -163,8 +163,7 @@ def save_excel(dataframe, filename, timestamp):
     dataframe.to_excel(filepath, encoding='utf-8')


-@setup_results_dir
-def save_partitions(partitions, timestamp, extra_columns=None, columns_order=None, sort_by_columns=None):
+def save_partitions_simple(partitions, partitions_path, extra_columns=None, columns_order=None, sort_by_columns=None):
     if extra_columns is not None:
         extra_df = pd.DataFrame(extra_columns, index=partitions.index)
         partitions = pd.concat([partitions, extra_df], axis=1)
@@ -175,9 +174,14 @@ def save_partitions(partitions, timestamp, extra_columns=None, columns_order=Non
     if sort_by_columns is not None:
         partitions = partitions.sort_values(sort_by_columns)

-    partitions_path = os.path.join(RESULTS_DIR, timestamp, 'partitions' + '.xls')
     partitions.to_excel(partitions_path, encoding='utf-8')


+@setup_results_dir
+def save_partitions(partitions, timestamp, **kwargs):
+    partitions_path = os.path.join(RESULTS_DIR, timestamp, 'partitions' + '.xls')
+    save_partitions_simple(partitions, partitions_path, **kwargs)

     return partitions_path
