
Commit

Improve documentation (#11)
* Add website and documentation links to setup.py

* Add documentation to call_htseq function.

* Refactored function argument.

* Added docstring to csvtolist utils function.

* Added docstrings.
Shaurita Hutchins authored Feb 24, 2022
1 parent 6a36a1e commit 5262774
Showing 8 changed files with 141 additions and 65 deletions.
47 changes: 39 additions & 8 deletions HTSeqCountCluster/htseq_count_cluster.py
@@ -13,13 +13,34 @@


def call_htseq(infile, gtf, outfile):
"""Call the htseq-count script."""
cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format(infile, gtf, outfile)
"""Call the htseq-count script.
:param infile: An alignment file of aligned reads in BAM format.
:type infile: str
:param gtf: The gtf (Gene transfer format) file.
:type gtf: str
:param outfile: The prefix for the output file ('_htseq.out' is appended).
:type outfile: str
"""
cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format(
infile, gtf, outfile)
return cmd


def htseq_jobber(input_path, inputlist, gtf, outpath, email):
"""Create multiple pbs jobs based on input list of files."""
"""Create multiple pbs jobs based on input list of files.
:param input_path: The path to your samples/sample folders.
:type input_path: str
:param inputlist: A list of sample names parsed from the input csv file.
:type inputlist: list
:param gtf: The gtf (Gene transfer format) file.
:type gtf: str
:param outpath: The directory for the output counts files.
:type outpath: str
:param email: An email address to send notifications.
:type email: str
"""
jobids = []
for item in inputlist:
htseqjob = PBSJob(email_address=email, base_jobname=item)
@@ -33,7 +54,13 @@ def htseq_jobber(input_path, inputlist, gtf, outpath, email):


def check_job_status(job_id, email=True):
"""Use Qstat to monitor your job status."""
"""Use Qstat to monitor your job status.
:param job_id: The job's id.
:type job_id: str
:param email: A flag to decide whether to send email, defaults to True
:type email: bool, optional
"""
# TODO Allow either slack notifications or email or text.
qwatch = Qstat().watch(job_id)
if qwatch == 'Job id not found.':
@@ -51,13 +78,17 @@ def main():
description=textwrap.dedent('''\
This is a command line wrapper around htseq-count.
'''))
parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.', required=True)
parser.add_argument('-f', '--infile', help='Name or path to your input csv file.', required=True)
parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.', required=True)
parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.',
required=True)
parser.add_argument('-f', '--infile', help='Name or path to your input csv file.',
required=True)
parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.',
required=True)
parser.add_argument('-o', '--outpath',
help='Directory of your output counts file. The counts file will be named.',
required=True)
parser.add_argument('-e', '--email', help='Email address to send script completion to.')
parser.add_argument('-e', '--email',
help='Email address to send script completion to.')

args = parser.parse_args()

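For illustration, a minimal sketch of the documented call_htseq function with hypothetical inputs (the BAM, gtf, and output paths are placeholders, not files from this repository):

from HTSeqCountCluster.htseq_count_cluster import call_htseq

# Hypothetical inputs for illustration only.
cmd = call_htseq(infile='/data/sample1-AGTC/sample1.bam',
                 gtf='/data/annotation/genes.gtf',
                 outfile='/data/counts/sample1-AGTC')

# The returned string follows htseq-count's CLI:
# htseq-count -f bam -s no /data/sample1-AGTC/sample1.bam
#     /data/annotation/genes.gtf -o /data/counts/sample1-AGTC_htseq.out
print(cmd)
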
21 changes: 13 additions & 8 deletions HTSeqCountCluster/mergecounts.py
@@ -1,28 +1,33 @@
# -*- coding: utf-8 -*-
import pandas as pd
import os
import argparse
import textwrap

import pandas as pd

from HTSeqCountCluster.logger import Logger

# Create a merge-counts logger
mc_log = Logger().default(logname="merge-counts", logfile=None)


def merge_counts_tables(filesdirectory):
def merge_counts_tables(files_dir):
"""Merge multiple counts tables into 1 counts table.
After running htseq-count-cluster, there will be a counts table for each
sample in the output directory. This function uses the genes column as
the first column and then inserts each sample's counts as a new column,
with the sample name as the column header.
:param files_dir: The directory of the individual counts files.
:type files_dir: str
"""
mc_log.info("Running merge-counts script.")
if filesdirectory is ".":
filesdirectory = os.getcwd()
if files_dir == ".":
files_dir = os.getcwd()

mc_log.info("Your directory location is: %s" % filesdirectory)
files = os.listdir(filesdirectory)
mc_log.info("Your directory location is: %s" % files_dir)
files = os.listdir(files_dir)

samplenames = []
sample_dfs = []
@@ -32,7 +37,7 @@ def merge_counts_tables(filesdirectory):
if ext == 'out':
samplename, barcode = filename.split('-')
samplenames.append(samplename)
filep = os.path.join(filesdirectory, file)
filep = os.path.join(files_dir, file)
data = pd.read_table(filep, header=None,
names=['Genes', samplename])
mc_log.info("A dataframe has been created for %s." % samplename)
@@ -67,7 +72,7 @@ def main():
type=str)
args = parser.parse_args()

merge_counts_tables(filesdirectory=args.directory)
merge_counts_tables(files_dir=args.directory)


if __name__ == '__main__':
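
The merge step itself falls outside this hunk; as a rough sketch, under the assumption that the hidden code joins each per-sample dataframe on the shared 'Genes' column, the idea looks roughly like this:

import pandas as pd

# Two toy per-sample tables in htseq-count's two-column format.
s1 = pd.DataFrame({'Genes': ['geneA', 'geneB'], 'sample1': [10, 0]})
s2 = pd.DataFrame({'Genes': ['geneA', 'geneB'], 'sample2': [7, 3]})

# Joining on 'Genes' produces one counts table with a column per sample.
merged = s1.merge(s2, on='Genes')
print(merged)
#    Genes  sample1  sample2
# 0  geneA       10        7
# 1  geneB        0        3
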
38 changes: 19 additions & 19 deletions HTSeqCountCluster/pbsjob/pbsconfig.py
@@ -4,7 +4,7 @@
import sys
import os

from HTSeqCountCluster.pbsjob.pbsutils import randomid
from HTSeqCountCluster.pbsjob.pbsutils import random_id

if sys.version_info.major < 3:
raise NotImplementedError('This is not designed for the python version in your \
@@ -15,23 +15,23 @@

_format1 = '%a %b %d %I:%M:%S %p %Y'

_jobname = 'htseq_{}'.format(randomid(length=4))
_jobname = 'htseq_{}'.format(random_id(length=4))

__DEFAULT__ = {
'author': getpass.getuser(),
'description': 'This is a default pbs job.',
'date': d.now().strftime(_format1),
'proj_name': 'htseq-cluster',
'select': '1',
'memgb': '2gb',
'cput': '24:00:00',
'wt': '12:00:00',
'job_name': _jobname,
'outfile': _jobname + '.o',
'errfile': _jobname + '.e',
'script': _jobname,
'log_name': _jobname,
'pbsworkdir': os.getcwd(),
'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'),
'email': 'n/a'
}
'author': getpass.getuser(),
'description': 'This is a default pbs job.',
'date': d.now().strftime(_format1),
'proj_name': 'htseq-cluster',
'select': '1',
'memgb': '2gb',
'cput': '24:00:00',
'wt': '12:00:00',
'job_name': _jobname,
'outfile': _jobname + '.o',
'errfile': _jobname + '.e',
'script': _jobname,
'log_name': _jobname,
'pbsworkdir': os.getcwd(),
'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'),
'email': 'n/a'
}
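
For context on how a dictionary of attributes like __DEFAULT__ can end up in a job script, here is a minimal string.Template sketch; the template text is illustrative and is not the actual PBS template shipped with the package:

from string import Template

# Illustrative PBS fragment only; the real template lives in the package data.
pbs_fragment = Template('#PBS -N ${job_name}\n'
                        '#PBS -l select=${select}:mem=${memgb}\n'
                        '#PBS -l walltime=${wt}\n'
                        '${cmd}\n')

attrs = {'job_name': 'htseq_AB12', 'select': '1', 'memgb': '2gb',
         'wt': '12:00:00', 'cmd': 'python3 htseq_AB12.py'}
print(pbs_fragment.substitute(attrs))
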
22 changes: 14 additions & 8 deletions HTSeqCountCluster/pbsjob/pbsjob.py
@@ -3,15 +3,16 @@
from pkg_resources import resource_filename

from HTSeqCountCluster.logger import Logger
from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, writecodefile,
import_temp, file2str)
from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, write_code_file,
import_temp, file_to_str)
from HTSeqCountCluster.pbsjob.pbsconfig import __DEFAULT__
from HTSeqCountCluster import pbsjob
from HTSeqCountCluster.pbsjob.qstat import Qstat


class BasePBSJob(object):
"""Base class for simple jobs."""

def __init__(self, base_jobname):
"""Initialize job attributes."""
self.default_job_attributes = __DEFAULT__
@@ -43,6 +44,7 @@ def _cleanup(self, jobname):

class PBSJob(BasePBSJob):
"""Create a qsub/pbs job & script for the job to execute."""

def __init__(self, email_address, base_jobname=None):
super().__init__(base_jobname=base_jobname)
self.email = email_address
@@ -82,10 +84,13 @@ def submit_code(self, code, cleanup=True, default=True):
code_str = code

if default:
self.sgejob_log.info('You are running a job with default attributes.')
writecodefile(filename=self.jobname, code=code_str, language='python')
self.sgejob_log.info(
'You are running a job with default attributes.')
write_code_file(filename=self.jobname,
                code=code_str, language='python')
pyfilename = self.jobname + '.py'
self.sgejob_log.info('%s python file has been created.' % pyfilename)
self.sgejob_log.info(
'%s python file has been created.' % pyfilename)

# Create the pbs script from the template or dict
pbstemp = import_temp(self.temp_pbs)
@@ -104,7 +109,8 @@ def submit_code(self, code, cleanup=True, default=True):
try:
cmd = ['qsub ' + self.jobname + '.pbs'] # this is the command
# Shell MUST be True
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE,
shell=True, check=True)
except CalledProcessError as err:
self.sgejob_log.error(err.stderr.decode('utf-8'))
if cleanup:
@@ -144,7 +150,8 @@ def submit_cmd(self, cmd, cleanup=True):
try:
cmd = ['qsub ' + self.jobname + '.pbs'] # this is the command
# Shell MUST be True
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE,
shell=True, check=True)
except CalledProcessError as err:
self.sgejob_log.error(err.stderr.decode('utf-8'))
if cleanup:
@@ -162,4 +169,3 @@ def submit_cmd(self, cmd, cleanup=True):

else: # Unsuccessful. Stdout will be '1'
self.sgejob_log.error('PBS job not submitted.')

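A hypothetical usage sketch of the PBSJob class shown above; the email address, sample name, and command are placeholders, and qsub must be available on the submitting machine:

from HTSeqCountCluster.pbsjob.pbsjob import PBSJob

# Placeholder values; submit_cmd writes a .pbs script and runs qsub on it.
job = PBSJob(email_address='user@example.com', base_jobname='sample1-AGTC')
job.submit_cmd('htseq-count -f bam -s no sample1.bam genes.gtf '
               '-o sample1-AGTC_htseq.out')
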
30 changes: 22 additions & 8 deletions HTSeqCountCluster/pbsjob/pbsutils.py
@@ -8,16 +8,26 @@


def basejobids(length, name='submit'):
""""Create base job id and name."""
base_id = randomid(length=length)
""""Create base job id and name.
:param length: [description]
:type length: [type]
:param name: [description], defaults to 'submit'
:type name: str, optional
:return: [description]
:rtype: [type]
"""
base_id = random_id(length=length)
base = name + "_{0}".format(base_id)

return base_id, base


def import_temp(filepath):
"""Import the script or file that you need a template of and that has
temp strings.
"""Import a template file that has template strings.
:param filepath: The path to the template file.
:type filepath: str
"""
file_temp = open(filepath, 'r')
file_str = file_temp.read()
@@ -27,19 +37,23 @@ def file2str(filepath):
return file_temp


def file2str(filepath):
"""Turn the contents of a file (python file) into a string."""
def file_to_str(filepath):
"""Turn the contents of a file (python file) into a string.
:param filepath: The path to the file to read.
:type filepath: str
"""
file_temp = open(filepath, 'r')
file_str = file_temp.read()
return file_str


def randomid(length=5):
def random_id(length=5):
"""Generate a random ID of 5 characters to append to qsub job name."""
return ''.join(random.sample(string.ascii_letters + string.digits, length))


def writecodefile(filename, code, language):
def write_code_file(filename, code, language):
"""Create a python file and write the code to it."""
if language == 'python':
with open(filename + '.py', 'w') as pyfile:
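
A short, hedged sketch of the renamed helpers above; the generated ids in the comments only illustrate the shape of the output:

from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, random_id,
                                               write_code_file)

print(random_id(length=4))             # e.g. 'aZ3k'

base_id, base = basejobids(4, name='htseq')
print(base)                            # e.g. 'htseq_aZ3k'

# Writes htseq_aZ3k.py containing the given code string.
write_code_file(filename=base, code="print('hello world')\n",
                language='python')
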
27 changes: 19 additions & 8 deletions HTSeqCountCluster/pbsjob/qstat.py
@@ -3,43 +3,54 @@
import getpass
import re

from HTSeqCountCluster.logger import Logger


class Qstat(object):
def __init__(self):
"""Initialize class."""
_username = getpass.getuser()
self.username = _username
self.split_regex = re.compile(r'\s+')
self.qstat_log = Logger().default(logname="qstat", logfile=None)

def qstatinfo(self, qstat_path='qstat'):
"""Retrieve qstat output."""
"""Retrieve qstat output.
:param qstat_path: The path to the qstat executable, defaults to 'qstat'
:type qstat_path: str, optional
"""
try:
qstatinfo = check_output([qstat_path])
except CalledProcessError as cpe:
return_code = 'qstat returncode: %s' % cpe.returncode
std_error = 'qstat standard output: %s' % cpe.stderr
print(return_code + '\n' + std_error)
self.qstat_log.error(return_code + '\n' + std_error)
except FileNotFoundError:
raise FileNotFoundError('qstat is not on your machine.')
else:
jobs = self._output_parser(qstatinfo)

jobs = self._output_parser(qstatinfo)

return jobs
return jobs

def _output_parser(self, output):
"""Parse output from qstat pbs commandline program.
Returns a list of dictionaries for each job.
:param output: The raw output from the qstat command.
:type output: bytes
"""
lines = output.decode('utf-8').split('\n')
del lines[:5]
jobs = []
for line in lines:
els = self.split_regex.split(line)
try:
j = {"job_id": els[0], "name": els[1], "user": els[2], "elapsed_time": els[3],
"status": els[4], "queue": els[5]}
jobs.append(j)
j = {"job_id": els[0], "name": els[1], "user": els[2],
"elapsed_time": els[3], "status": els[4],
"queue": els[5]}
jobs.append(j)

except IndexError:
pass
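
To make the parsing in _output_parser concrete, here is a sketch with a fabricated qstat data row (the header lines qstat prints are assumed to be stripped already, as the code above does with del lines[:5]):

import re

split_regex = re.compile(r'\s+')

# Fabricated qstat row; real output depends on the PBS installation.
line = '12345.cluster htseq_aZ3k shutchins 00:01:02 R workq'
els = split_regex.split(line)
job = {"job_id": els[0], "name": els[1], "user": els[2],
       "elapsed_time": els[3], "status": els[4], "queue": els[5]}
print(job["job_id"], job["status"])    # 12345.cluster R
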
13 changes: 11 additions & 2 deletions HTSeqCountCluster/utils/__init__.py
@@ -2,8 +2,17 @@
import pandas as pd


def csvtolist(csvfile):
def csvtolist(csvfile, column=0):
"""Convert a column of a csv file to a list.
:param csvfile: A comma delimited file.
:type csvfile: str
:param column: The index of the column to convert, defaults to 0.
:type column: int, optional
:return: A sorted list of the values in the chosen column.
:rtype: list
"""
df = pd.read_csv(csvfile, header=None)
output_list = sorted(list(df[0]))
output_list = sorted(list(df[column]))

return output_list
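
A small, self-contained usage sketch of csvtolist; the csv written here is a throwaway example file, not part of the repository:

import pandas as pd
from HTSeqCountCluster.utils import csvtolist

# Write a headerless, single-column csv like the one the script expects.
pd.DataFrame([['sample2-GATC'], ['sample1-AGTC']]).to_csv(
    'samples.csv', index=False, header=False)

print(csvtolist('samples.csv'))            # ['sample1-AGTC', 'sample2-GATC']
print(csvtolist('samples.csv', column=0))  # same column, selected explicitly
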
