-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
132 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,3 +5,6 @@ __pycache__/ | |
.idea/ | ||
experiments/results_*/ | ||
experiments/tomato/results/ | ||
build/ | ||
clustermatch.egg-info/ | ||
dist/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Package metadata: a one-line summary of the project and its release version.
__short_description__ = "Efficient clustering method for processing highly diverse data"
__version__ = "0.1.4a1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import argparse | ||
import logging | ||
|
||
from clustermatch import __short_description__ | ||
from clustermatch.cluster import calculate_simmatrix, get_partition_spectral, get_pval_matrix_by_partition | ||
from utils.data import merge_sources | ||
from utils.output import save_partitions_simple | ||
|
||
# Layout of every log record: "[timestamp] LEVEL: message".
LOG_FORMAT = "[%(asctime)s] %(levelname)s: %(message)s"

# Configure logging once, at import time, showing INFO and above.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Logger used by this module; it is looked up under the literal name 'root'.
logger = logging.getLogger('root')
|
||
|
||
def run(argv=None):
    """Command-line entry point: read input files, cluster them, save partitions.

    Parses the command-line options, merges the input sources, computes the
    similarity matrix, runs spectral clustering for the requested numbers of
    clusters and writes the resulting partitions to the output file.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            ``console_scripts`` entry point relies on.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid,
            including when any requested number of clusters is below 2.
    """
    parser = argparse.ArgumentParser(description=__short_description__)

    # Mandatory parameters
    parser.add_argument(
        '-i', '--input-files', required=True, type=str, nargs='+',
        help='Path to input data files (could be one or multiple files). '
             'It could be a csv, xls (with different worksheets) or zip file.',
    )
    parser.add_argument(
        '-k', '--n-clusters', required=True, type=int, nargs='+',
        help='Number of final clusters (could contain multiple values).',
    )
    parser.add_argument(
        '-o', '--output-file', required=True, type=str,
        help='Path to output data partition file.',
    )

    # Optional parameters
    parser.add_argument(
        '--n-init', type=int, default=10,
        help='Number of times the k-means algorithm will be run with different centroid seeds. '
             'The final results will be the best output of n_init consecutive runs in terms of inertia.',
    )
    parser.add_argument(
        '--n-jobs', type=int, default=1,
        help='The number of parallel jobs to run. -1 means using all processors.',
    )
    parser.add_argument(
        '--minimum-objects', type=int, default=5,
        help='Minimum amount of objects shared between two features to process them.',
    )

    args = parser.parse_args(argv)

    # Validate parameters: every requested k must describe a real partition.
    if any(k < 2 for k in args.n_clusters):
        parser.error('Number of final clusters must be >= 2')

    # Read data files
    logger.info('Reading input data files')
    merged_sources, feature_names, sources_names = merge_sources(args.input_files)

    # Run clustermatch
    logger.info('Getting similarity matrix for %s variables', merged_sources.shape[0])
    cm_sim_matrix = calculate_simmatrix(
        merged_sources,
        min_n_common_features=args.minimum_objects,
        n_jobs=args.n_jobs,
    )

    logger.info('Running spectral clustering with k=%s', args.n_clusters)
    partition = get_partition_spectral(
        cm_sim_matrix,
        args.n_clusters,
        n_init=args.n_init,
        n_jobs=args.n_jobs,
    )

    # TODO: optionally compute the p-value matrix for the partition
    # (get_pval_matrix_by_partition); this is not exposed as an option yet.

    # One output column per requested k, labelled "k=<value>".
    columns_order = [f'k={k}' for k in args.n_clusters]

    logger.info('Saving partition to %s', args.output_file)
    save_partitions_simple(
        partition,
        args.output_file,
        extra_columns={'sources': sources_names},
        columns_order=['sources', *columns_order],
        sort_by_columns=columns_order,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import setuptools
from clustermatch import __version__, __short_description__

# Read the README explicitly as UTF-8 so the build does not depend on the
# platform's default encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="clustermatch",
    version=__version__,
    author="Milton Pividori",
    author_email="miltondp@uchicago.edu",
    description=__short_description__,
    license='MIT',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/sinc-lab/clustermatch",
    packages=['clustermatch', 'utils'],
    # The command-line module uses f-strings, which require Python 3.6+.
    python_requires='>=3.6',
    install_requires=[
        'numpy',
        'scipy',
        'pandas',
        'joblib',
        'scikit-learn',
        'xlrd',
        'xlwt',
        'openpyxl',
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        # NOTE(review): the package version is an alpha pre-release
        # (0.1.4a1); confirm that "Production/Stable" is intended.
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console",
    ],
    # Installs a `clustermatch` command that calls clustermatch.main.run().
    entry_points={
        'console_scripts': [
            'clustermatch = clustermatch.main:run',
        ],
    },
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters