diff --git a/.hgignore b/.hgignore index f948599..093aca0 100644 --- a/.hgignore +++ b/.hgignore @@ -5,3 +5,6 @@ __pycache__/ .idea/ experiments/results_*/ experiments/tomato/results/ +build/ +clustermatch.egg-info/ +dist/ diff --git a/clustermatch/__init__.py b/clustermatch/__init__.py index e69de29..5d73230 100644 --- a/clustermatch/__init__.py +++ b/clustermatch/__init__.py @@ -0,0 +1,2 @@ +__version__ = '0.1.4a1' +__short_description__ = 'Efficient clustering method for processing highly diverse data' diff --git a/clustermatch/main.py b/clustermatch/main.py new file mode 100644 index 0000000..0cb0299 --- /dev/null +++ b/clustermatch/main.py @@ -0,0 +1,78 @@ +import argparse +import logging + +from clustermatch import __short_description__ +from clustermatch.cluster import calculate_simmatrix, get_partition_spectral, get_pval_matrix_by_partition +from utils.data import merge_sources +from utils.output import save_partitions_simple + +LOG_FORMAT = "[%(asctime)s] %(levelname)s: %(message)s" +logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) +logger = logging.getLogger('root') + + +def run(): + parser = argparse.ArgumentParser(description=__short_description__) + + # Mandatory parameters + parser.add_argument('-i', '--input-files', required=True, type=str, nargs='+', help= + 'Path to input data files (could be one or multiple files). It could be a csv, xls (with different worksheets) or zip file.' + ) + parser.add_argument('-k', '--n-clusters', required=True, type=int, nargs='+', help= + 'Number of final clusters (could contain multiple values).' + ) + parser.add_argument('-o', '--output-file', required=True, type=str, help= + 'Path to output data partition file. The extension' + ) + + # Optional parameters + parser.add_argument('--n-init', type=int, default=10, help= + 'Number of time the k-means algorithm will be run with different centroid seeds. ' + 'The final results will be the best output of n_init consecutive runs in terms of inertia.' + ) + parser.add_argument('--n-jobs', type=int, default=1, help= + 'The number of parallel jobs to run. -1 means using all processors.' + ) + parser.add_argument('--minimum-objects', type=int, default=5, help= + 'Minimum amount of objects shared between two features to process them.' + ) + + args = parser.parse_args() + + # Validate parameters + if any(x < 2 for x in args.n_clusters): + parser.error('Number of final clusters must be >= 2') + + # Read data files + logger.info('Reading input data files') + merged_sources, feature_names, sources_names = merge_sources(args.input_files) + + # Run clustermatch + logger.info(f'Getting similarity matrix for {merged_sources.shape[0]} variables') + cm_sim_matrix = calculate_simmatrix(merged_sources, min_n_common_features=args.minimum_objects, n_jobs=args.n_jobs) + + logger.info(f'Running spectral clustering with k={args.n_clusters}') + partition = get_partition_spectral(cm_sim_matrix, args.n_clusters, n_init=args.n_init, n_jobs=args.n_jobs) + + # if args.compute_pvalues: + # print('Getting pvalue matrix') + # cm_pvalue_sim_matrix = get_pval_matrix_by_partition( + # merged_sources, partition, + # k_internal, min_n_tomatoes, + # args.compute_pvalues_n_perms, + # n_jobs + # ) + # + # save_excel(cm_pvalue_sim_matrix, 'cm_pvalue', timestamp=timestamp) + # print('cm_pvalue saved') + + columns_order = ['k={0}'.format(str(k)) for k in args.n_clusters] + + logger.info(f'Saving partition to {args.output_file}') + save_partitions_simple( + partition, + args.output_file, + extra_columns={'sources': sources_names}, + columns_order=['sources', *columns_order], + sort_by_columns=columns_order + ) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..60b4f3a --- /dev/null +++ b/setup.py @@ -0,0 +1,41 @@ +import setuptools +from clustermatch import __version__, __short_description__ + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="clustermatch", + version=__version__, + author="Milton Pividori", + author_email="miltondp@uchicago.edu", + description=__short_description__, + license='MIT', + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/sinc-lab/clustermatch", + packages=['clustermatch', 'utils'], + python_requires='>=3', + install_requires=[ + 'numpy', + 'scipy', + 'pandas', + 'joblib', + 'scikit-learn', + 'xlrd', + 'xlwt', + 'openpyxl', + ], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Development Status :: 5 - Production/Stable", + "Environment :: Console" + ], + entry_points={ + 'console_scripts': [ + 'clustermatch = clustermatch.main:run' + ] + }, +) diff --git a/utils/data.py b/utils/data.py index 8efe6c3..81921d5 100644 --- a/utils/data.py +++ b/utils/data.py @@ -104,7 +104,7 @@ def merge_sources(source_files, rep_merge=np.mean): sources_names = [sn for sn_idx, sn in enumerate(sources_names) for i in range(processed_sources[sn_idx].shape[0])] - full_sources = pd.concat(processed_sources) + full_sources = pd.concat(processed_sources, sort=True) # renamed duplicated if not full_sources.index.is_unique: diff --git a/utils/output.py b/utils/output.py index 102f26b..a33f41d 100644 --- a/utils/output.py +++ b/utils/output.py @@ -163,8 +163,7 @@ def save_excel(dataframe, filename, timestamp): dataframe.to_excel(filepath, encoding='utf-8') -@setup_results_dir -def save_partitions(partitions, timestamp, extra_columns=None, columns_order=None, sort_by_columns=None): +def save_partitions_simple(partitions, partitions_path, extra_columns=None, columns_order=None, sort_by_columns=None): if extra_columns is not None: extra_df = pd.DataFrame(extra_columns, index=partitions.index) partitions = pd.concat([partitions, extra_df], axis=1) @@ -175,9 +174,14 @@ def save_partitions(partitions, timestamp, extra_columns=None, columns_order=Non if sort_by_columns is not None: partitions = partitions.sort_values(sort_by_columns) - partitions_path = os.path.join(RESULTS_DIR, timestamp, 'partitions' + '.xls') partitions.to_excel(partitions_path, encoding='utf-8') + +@setup_results_dir +def save_partitions(partitions, timestamp, **kwargs): + partitions_path = os.path.join(RESULTS_DIR, timestamp, 'partitions' + '.xls') + save_partitions_simple(partitions, partitions_path, **kwargs) + return partitions_path