From 5262774320eb4d30ee49b9e41c4668cbb03db16c Mon Sep 17 00:00:00 2001 From: Shaurita Hutchins Date: Thu, 24 Feb 2022 17:05:13 -0600 Subject: [PATCH] Improve documentation (#11) * Add website and documentation links to setup.py * Add documentation to call_htseq function. * Refactored function argument. * Added docstring to csvtolist utils function. * Added docstrings. --- HTSeqCountCluster/htseq_count_cluster.py | 47 ++++++++++++++++++++---- HTSeqCountCluster/mergecounts.py | 21 +++++++---- HTSeqCountCluster/pbsjob/pbsconfig.py | 38 +++++++++---------- HTSeqCountCluster/pbsjob/pbsjob.py | 22 +++++++---- HTSeqCountCluster/pbsjob/pbsutils.py | 30 +++++++++++---- HTSeqCountCluster/pbsjob/qstat.py | 27 ++++++++++---- HTSeqCountCluster/utils/__init__.py | 13 ++++++- setup.py | 8 ++-- 8 files changed, 141 insertions(+), 65 deletions(-) diff --git a/HTSeqCountCluster/htseq_count_cluster.py b/HTSeqCountCluster/htseq_count_cluster.py index a7fd6a1..45726e2 100644 --- a/HTSeqCountCluster/htseq_count_cluster.py +++ b/HTSeqCountCluster/htseq_count_cluster.py @@ -13,13 +13,34 @@ def call_htseq(infile, gtf, outfile): - """Call the htseq-count script.""" - cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format(infile, gtf, outfile) + """Call the htseq-count script. + + :param infile: An alignment file of aligned reads in SAM format. + :type infile: str + :param gtf: The gtf (Gene transfer format) file. + :type gtf: str + :param outfile: The name of the output SAM alignment file. + :type outfile: str + """ + cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format( + infile, gtf, outfile) return cmd def htseq_jobber(input_path, inputlist, gtf, outpath, email): - """Create multiple pbs jobs based on input list of files.""" + """Create multiple pbs jobs based on input list of files. + + :param input_path: [description] + :type input_path: [type] + :param inputlist: [description] + :type inputlist: [type] + :param gtf: The gtf (Gene transfer format) file. + :type gtf: str + :param outpath: [description] + :type outpath: [type] + :param email: An email address to send notifications. + :type email: str + """ jobids = [] for item in inputlist: htseqjob = PBSJob(email_address=email, base_jobname=item) @@ -33,7 +54,13 @@ def htseq_jobber(input_path, inputlist, gtf, outpath, email): def check_job_status(job_id, email=True): - """Use Qstat to monitor your job status.""" + """Use Qstat to monitor your job status. + + :param job_id: The job's id. + :type job_id: str + :param email: A flag to decide whether to send email, defaults to True + :type email: bool, optional + """ # TODO Allow either slack notifications or email or text. qwatch = Qstat().watch(job_id) if qwatch == 'Job id not found.': @@ -51,13 +78,17 @@ def main(): description=textwrap.dedent('''\ This is a command line wrapper around htseq-count. ''')) - parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.', required=True) - parser.add_argument('-f', '--infile', help='Name or path to your input csv file.', required=True) - parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.', required=True) + parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.', + required=True) + parser.add_argument('-f', '--infile', help='Name or path to your input csv file.', + required=True) + parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.', + required=True) parser.add_argument('-o', '--outpath', help='Directory of your output counts file. The counts file will be named.', required=True) - parser.add_argument('-e', '--email', help='Email address to send script completion to.') + parser.add_argument('-e', '--email', + help='Email address to send script completion to.') args = parser.parse_args() diff --git a/HTSeqCountCluster/mergecounts.py b/HTSeqCountCluster/mergecounts.py index d8db129..4cc7cba 100644 --- a/HTSeqCountCluster/mergecounts.py +++ b/HTSeqCountCluster/mergecounts.py @@ -1,28 +1,33 @@ # -*- coding: utf-8 -*- -import pandas as pd import os import argparse import textwrap + +import pandas as pd + from HTSeqCountCluster.logger import Logger # Create a merge-counts logger mc_log = Logger().default(logname="merge-counts", logfile=None) -def merge_counts_tables(filesdirectory): +def merge_counts_tables(files_dir): """Merge multiple counts tables into 1 counts table. After running htseq-count-cluster, there will be a counts table for each sample in the output directory. This function will use the genes column as the first column and then insert each subsequent sample name as column header with counts data as the column rows. + + :param files_dir: The directory of the individual counts files. + :type files_dir: str """ mc_log.info("Running merge-counts script.") - if filesdirectory is ".": - filesdirectory = os.getcwd() + if files_dir is ".": + files_dir = os.getcwd() - mc_log.info("Your directory location is: %s" % filesdirectory) - files = os.listdir(filesdirectory) + mc_log.info("Your directory location is: %s" % files_dir) + files = os.listdir(files_dir) samplenames = [] sample_dfs = [] @@ -32,7 +37,7 @@ def merge_counts_tables(filesdirectory): if ext == 'out': samplename, barcode = filename.split('-') samplenames.append(samplename) - filep = os.path.join(filesdirectory, file) + filep = os.path.join(files_dir, file) data = pd.read_table(filep, header=None, names=['Genes', samplename]) mc_log.info("A dataframe has been created for %s." % samplename) @@ -67,7 +72,7 @@ def main(): type=str) args = parser.parse_args() - merge_counts_tables(filesdirectory=args.directory) + merge_counts_tables(files_dir=args.directory) if __name__ == '__main__': diff --git a/HTSeqCountCluster/pbsjob/pbsconfig.py b/HTSeqCountCluster/pbsjob/pbsconfig.py index 713421f..97bb333 100644 --- a/HTSeqCountCluster/pbsjob/pbsconfig.py +++ b/HTSeqCountCluster/pbsjob/pbsconfig.py @@ -4,7 +4,7 @@ import sys import os -from HTSeqCountCluster.pbsjob.pbsutils import randomid +from HTSeqCountCluster.pbsjob.pbsutils import random_id if sys.version_info.major < 3: raise NotImplementedError('This is not designed for the python version in your \ @@ -15,23 +15,23 @@ _format1 = '%a %b %d %I:%M:%S %p %Y' -_jobname = 'htseq_{}'.format(randomid(length=4)) +_jobname = 'htseq_{}'.format(random_id(length=4)) __DEFAULT__ = { - 'author': getpass.getuser(), - 'description': 'This is a default pbs job.', - 'date': d.now().strftime(_format1), - 'proj_name': 'htseq-cluster', - 'select': '1', - 'memgb': '2gb', - 'cput': '24:00:00', - 'wt': '12:00:00', - 'job_name': _jobname, - 'outfile': _jobname + '.o', - 'errfile': _jobname + '.e', - 'script': _jobname, - 'log_name': _jobname, - 'pbsworkdir': os.getcwd(), - 'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'), - 'email': 'n/a' - } + 'author': getpass.getuser(), + 'description': 'This is a default pbs job.', + 'date': d.now().strftime(_format1), + 'proj_name': 'htseq-cluster', + 'select': '1', + 'memgb': '2gb', + 'cput': '24:00:00', + 'wt': '12:00:00', + 'job_name': _jobname, + 'outfile': _jobname + '.o', + 'errfile': _jobname + '.e', + 'script': _jobname, + 'log_name': _jobname, + 'pbsworkdir': os.getcwd(), + 'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'), + 'email': 'n/a' +} diff --git a/HTSeqCountCluster/pbsjob/pbsjob.py b/HTSeqCountCluster/pbsjob/pbsjob.py index d0e754c..1748a45 100644 --- a/HTSeqCountCluster/pbsjob/pbsjob.py +++ b/HTSeqCountCluster/pbsjob/pbsjob.py @@ -3,8 +3,8 @@ from pkg_resources import resource_filename from HTSeqCountCluster.logger import Logger -from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, writecodefile, - import_temp, file2str) +from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, write_code_file, + import_temp, file_to_str) from HTSeqCountCluster.pbsjob.pbsconfig import __DEFAULT__ from HTSeqCountCluster import pbsjob from HTSeqCountCluster.pbsjob.qstat import Qstat @@ -12,6 +12,7 @@ class BasePBSJob(object): """Base class for simple jobs.""" + def __init__(self, base_jobname): """Initialize job attributes.""" self.default_job_attributes = __DEFAULT__ @@ -43,6 +44,7 @@ def _cleanup(self, jobname): class PBSJob(BasePBSJob): """Create a qsub/pbs job & script for the job to execute.""" + def __init__(self, email_address, base_jobname=None): super().__init__(base_jobname=base_jobname) self.email = email_address @@ -82,10 +84,13 @@ def submit_code(self, code, cleanup=True, default=True): code_str = code if default: - self.sgejob_log.info('You are running a job with default attributes.') - writecodefile(filename=self.jobname, code=code_str, language='python') + self.sgejob_log.info( + 'You are running a job with default attributes.') + writecodefile(filename=self.jobname, + code=code_str, language='python') pyfilename = self.jobname + '.py' - self.sgejob_log.info('%s python file has been created.' % pyfilename) + self.sgejob_log.info( + '%s python file has been created.' % pyfilename) # Create the pbs script from the template or dict pbstemp = import_temp(self.temp_pbs) @@ -104,7 +109,8 @@ def submit_code(self, code, cleanup=True, default=True): try: cmd = ['qsub ' + self.jobname + '.pbs'] # this is the command # Shell MUST be True - cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True) + cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, + shell=True, check=True) except CalledProcessError as err: self.sgejob_log.error(err.stderr.decode('utf-8')) if cleanup: @@ -144,7 +150,8 @@ def submit_cmd(self, cmd, cleanup=True): try: cmd = ['qsub ' + self.jobname + '.pbs'] # this is the command # Shell MUST be True - cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True) + cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, + shell=True, check=True) except CalledProcessError as err: self.sgejob_log.error(err.stderr.decode('utf-8')) if cleanup: @@ -162,4 +169,3 @@ def submit_cmd(self, cmd, cleanup=True): else: # Unsuccessful. Stdout will be '1' self.sgejob_log.error('PBS job not submitted.') - diff --git a/HTSeqCountCluster/pbsjob/pbsutils.py b/HTSeqCountCluster/pbsjob/pbsutils.py index f0bf93f..60b102d 100644 --- a/HTSeqCountCluster/pbsjob/pbsutils.py +++ b/HTSeqCountCluster/pbsjob/pbsutils.py @@ -8,16 +8,26 @@ def basejobids(length, name='submit'): - """"Create base job id and name.""" - base_id = randomid(length=length) + """"Create base job id and name. + + :param length: [description] + :type length: [type] + :param name: [description], defaults to 'submit' + :type name: str, optional + :return: [description] + :rtype: [type] + """ + base_id = random_id(length=length) base = name + "_{0}".format(base_id) return base_id, base def import_temp(filepath): - """Import the script or file that you need a template of and that has - temp strings. + """Import a template file that has template strings. + + :param filepath: [description] + :type filepath: [type] """ file_temp = open(filepath, 'r') file_str = file_temp.read() @@ -27,19 +37,23 @@ def import_temp(filepath): return file_temp -def file2str(filepath): - """Turn the contents of a file (python file) into a string.""" +def file_to_str(filepath): + """Turn the contents of a file (python file) into a string. + + :param filepath: [description] + :type filepath: [type] + """ file_temp = open(filepath, 'r') file_str = file_temp.read() return file_str -def randomid(length=5): +def random_id(length=5): """Generate a random ID of 5 characters to append to qsub job name.""" return ''.join(random.sample(string.ascii_letters + string.digits, length)) -def writecodefile(filename, code, language): +def write_code_file(filename, code, language): """Create a python file and write the code to it.""" if language == 'python': with open(filename + '.py', 'w') as pyfile: diff --git a/HTSeqCountCluster/pbsjob/qstat.py b/HTSeqCountCluster/pbsjob/qstat.py index 1f6de9c..f596b93 100644 --- a/HTSeqCountCluster/pbsjob/qstat.py +++ b/HTSeqCountCluster/pbsjob/qstat.py @@ -3,6 +3,8 @@ import getpass import re +from HTSeqCountCluster.logger import Logger + class Qstat(object): def __init__(self): @@ -10,26 +12,34 @@ def __init__(self): _username = getpass.getuser() self.username = _username self.split_regex = re.compile(r'\s+') + self.qstat_log = Logger().default(logname="qstat", logfile=None) def qstatinfo(self, qstat_path='qstat'): - """Retrieve qstat output.""" + """Retrieve qstat output. + + :param qstat_path: [description], defaults to 'qstat' + :type qstat_path: str, optional + """ try: qstatinfo = check_output([qstat_path]) except CalledProcessError as cpe: return_code = 'qstat returncode: %s' % cpe.returncode std_error = 'qstat standard output: %s' % cpe.stderr - print(return_code + '\n' + std_error) + self.qstat_log(return_code + '\n' + std_error) except FileNotFoundError: raise FileNotFoundError('qstat is not on your machine.') + else: + jobs = self._output_parser(qstatinfo) - jobs = self._output_parser(qstatinfo) - - return jobs + return jobs def _output_parser(self, output): """Parse output from qstat pbs commandline program. Returns a list of dictionaries for each job. + + :param output: The qstat output. + :type output: [type] """ lines = output.decode('utf-8').split('\n') del lines[:5] @@ -37,9 +47,10 @@ def _output_parser(self, output): for line in lines: els = self.split_regex.split(line) try: - j = {"job_id": els[0], "name": els[1], "user": els[2], "elapsed_time": els[3], - "status": els[4], "queue": els[5]} - jobs.append(j) + j = {"job_id": els[0], "name": els[1], "user": els[2], + "elapsed_time": els[3], "status": els[4], + "queue": els[5]} + jobs.append(j) except IndexError: pass diff --git a/HTSeqCountCluster/utils/__init__.py b/HTSeqCountCluster/utils/__init__.py index 3029df7..8254d89 100644 --- a/HTSeqCountCluster/utils/__init__.py +++ b/HTSeqCountCluster/utils/__init__.py @@ -2,8 +2,17 @@ import pandas as pd -def csvtolist(csvfile): +def csvtolist(csvfile, column=0): + """Convert a column of a csv file to a list. + + :param csvfile: A comma delimited file. + :type csvfile: str + :param column: The number of the column to convert. + :type column: int + :return: A list + :rtype: list + """ df = pd.read_csv(csvfile, header=None) - output_list = sorted(list(df[0])) + output_list = sorted(list(df[column])) return output_list diff --git a/setup.py b/setup.py index 06c1eb8..5832435 100644 --- a/setup.py +++ b/setup.py @@ -13,12 +13,11 @@ # Set the home path of the setup script/package home = path.abspath(path.dirname(__file__)) name = 'HTSeqCountCluster' -version = '1.3' - +version = '1.4' def readme(): """Get the long description from the README file.""" - with open(path.join(home, 'README.rst'), encoding='utf-8') as f: + with open(path.join(home, 'README.md'), encoding='utf-8') as f: return f.read() @@ -41,7 +40,8 @@ def readme(): 'Programming Language :: Python :: 3.6' ], project_urls={ - 'Documentation': 'https://tinyurl.com/yb7kz7zz', + 'Website': 'https://tinyurl.com/yb7kz7zz', + 'Documentation': 'http://htseq-count-cluster.rtfd.io/', }, # Packages will be automatically found if not in this list. packages=find_packages(),