
Commit

Improve documentation (#11)
* Add website and documentation links to setup.py

* Add documentation to call_htseq function.

* Refactored function argument.

* Added docstring to csvtolist utils function.

* Added docstrings.
Shaurita Hutchins authored Feb 24, 2022
1 parent 6a36a1e commit 5262774
Showing 8 changed files with 141 additions and 65 deletions.
47 changes: 39 additions & 8 deletions HTSeqCountCluster/htseq_count_cluster.py
@@ -13,13 +13,34 @@


def call_htseq(infile, gtf, outfile):
"""Call the htseq-count script."""
cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format(infile, gtf, outfile)
"""Call the htseq-count script.
:param infile: An alignment file of aligned reads in BAM format.
:type infile: str
:param gtf: The gtf (Gene transfer format) file.
:type gtf: str
:param outfile: The prefix for the output file ('_htseq.out' is appended).
:type outfile: str
"""
cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format(
infile, gtf, outfile)
return cmd


def htseq_jobber(input_path, inputlist, gtf, outpath, email):
"""Create multiple pbs jobs based on input list of files."""
"""Create multiple pbs jobs based on input list of files.
:param input_path: The path to your samples/sample folders.
:type input_path: str
:param inputlist: A list of sample names parsed from the input csv file.
:type inputlist: list
:param gtf: The gtf (Gene transfer format) file.
:type gtf: str
:param outpath: The directory for the output counts files.
:type outpath: str
:param email: An email address to send notifications.
:type email: str
"""
jobids = []
for item in inputlist:
htseqjob = PBSJob(email_address=email, base_jobname=item)
@@ -33,7 +54,13 @@ def htseq_jobber(input_path, inputlist, gtf, outpath, email):


def check_job_status(job_id, email=True):
"""Use Qstat to monitor your job status."""
"""Use Qstat to monitor your job status.
:param job_id: The job's id.
:type job_id: str
:param email: A flag to decide whether to send email, defaults to True
:type email: bool, optional
"""
# TODO Allow either slack notifications or email or text.
qwatch = Qstat().watch(job_id)
if qwatch == 'Job id not found.':
@@ -51,13 +78,17 @@ def main():
description=textwrap.dedent('''\
This is a command line wrapper around htseq-count.
'''))
parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.', required=True)
parser.add_argument('-f', '--infile', help='Name or path to your input csv file.', required=True)
parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.', required=True)
parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.',
required=True)
parser.add_argument('-f', '--infile', help='Name or path to your input csv file.',
required=True)
parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.',
required=True)
parser.add_argument('-o', '--outpath',
help='Directory of your output counts file. The counts file will be named.',
required=True)
parser.add_argument('-e', '--email', help='Email address to send script completion to.')
parser.add_argument('-e', '--email',
help='Email address to send script completion to.')

args = parser.parse_args()

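For illustration, a minimal sketch of the documented call_htseq function with hypothetical inputs (the BAM, gtf, and output paths are placeholders, not files from this repository):

from HTSeqCountCluster.htseq_count_cluster import call_htseq

# Hypothetical inputs for illustration only.
cmd = call_htseq(infile='/data/sample1-AGTC/sample1.bam',
                 gtf='/data/annotation/genes.gtf',
                 outfile='/data/counts/sample1-AGTC')

# The returned string follows htseq-count's CLI:
# htseq-count -f bam -s no /data/sample1-AGTC/sample1.bam
#     /data/annotation/genes.gtf -o /data/counts/sample1-AGTC_htseq.out
print(cmd)
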
21 changes: 13 additions & 8 deletions HTSeqCountCluster/mergecounts.py
@@ -1,28 +1,33 @@
# -*- coding: utf-8 -*-
import pandas as pd
import os
import argparse
import textwrap

import pandas as pd

from HTSeqCountCluster.logger import Logger

# Create a merge-counts logger
mc_log = Logger().default(logname="merge-counts", logfile=None)


def merge_counts_tables(filesdirectory):
def merge_counts_tables(files_dir):
"""Merge multiple counts tables into 1 counts table.
After running htseq-count-cluster, there will be a counts table for each
sample in the output directory. This function uses the genes column as
the first column and then inserts each sample's counts as a new column,
with the sample name as the column header.
:param files_dir: The directory of the individual counts files.
:type files_dir: str
"""
mc_log.info("Running merge-counts script.")
if filesdirectory is ".":
filesdirectory = os.getcwd()
if files_dir == ".":
files_dir = os.getcwd()

mc_log.info("Your directory location is: %s" % filesdirectory)
files = os.listdir(filesdirectory)
mc_log.info("Your directory location is: %s" % files_dir)
files = os.listdir(files_dir)

samplenames = []
sample_dfs = []
@@ -32,7 +37,7 @@ def merge_counts_tables(filesdirectory):
if ext == 'out':
samplename, barcode = filename.split('-')
samplenames.append(samplename)
filep = os.path.join(filesdirectory, file)
filep = os.path.join(files_dir, file)
data = pd.read_table(filep, header=None,
names=['Genes', samplename])
mc_log.info("A dataframe has been created for %s." % samplename)
@@ -67,7 +72,7 @@ def main():
type=str)
args = parser.parse_args()

merge_counts_tables(filesdirectory=args.directory)
merge_counts_tables(files_dir=args.directory)


if __name__ == '__main__':
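
The merge step itself falls outside this hunk; as a rough sketch, under the assumption that the hidden code joins each per-sample dataframe on the shared 'Genes' column, the idea looks roughly like this:

import pandas as pd

# Two toy per-sample tables in htseq-count's two-column format.
s1 = pd.DataFrame({'Genes': ['geneA', 'geneB'], 'sample1': [10, 0]})
s2 = pd.DataFrame({'Genes': ['geneA', 'geneB'], 'sample2': [7, 3]})

# Joining on 'Genes' produces one counts table with a column per sample.
merged = s1.merge(s2, on='Genes')
print(merged)
#    Genes  sample1  sample2
# 0  geneA       10        7
# 1  geneB        0        3
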
38 changes: 19 additions & 19 deletions HTSeqCountCluster/pbsjob/pbsconfig.py
@@ -4,7 +4,7 @@
import sys
import os

from HTSeqCountCluster.pbsjob.pbsutils import randomid
from HTSeqCountCluster.pbsjob.pbsutils import random_id

if sys.version_info.major < 3:
raise NotImplementedError('This is not designed for the python version in your \
@@ -15,23 +15,23 @@

_format1 = '%a %b %d %I:%M:%S %p %Y'

_jobname = 'htseq_{}'.format(randomid(length=4))
_jobname = 'htseq_{}'.format(random_id(length=4))

__DEFAULT__ = {
'author': getpass.getuser(),
'description': 'This is a default pbs job.',
'date': d.now().strftime(_format1),
'proj_name': 'htseq-cluster',
'select': '1',
'memgb': '2gb',
'cput': '24:00:00',
'wt': '12:00:00',
'job_name': _jobname,
'outfile': _jobname + '.o',
'errfile': _jobname + '.e',
'script': _jobname,
'log_name': _jobname,
'pbsworkdir': os.getcwd(),
'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'),
'email': 'n/a'
}
'author': getpass.getuser(),
'description': 'This is a default pbs job.',
'date': d.now().strftime(_format1),
'proj_name': 'htseq-cluster',
'select': '1',
'memgb': '2gb',
'cput': '24:00:00',
'wt': '12:00:00',
'job_name': _jobname,
'outfile': _jobname + '.o',
'errfile': _jobname + '.e',
'script': _jobname,
'log_name': _jobname,
'pbsworkdir': os.getcwd(),
'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'),
'email': 'n/a'
}
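
For context on how a dictionary of attributes like __DEFAULT__ can end up in a job script, here is a minimal string.Template sketch; the template text is illustrative and is not the actual PBS template shipped with the package:

from string import Template

# Illustrative PBS fragment only; the real template lives in the package data.
pbs_fragment = Template('#PBS -N ${job_name}\n'
                        '#PBS -l select=${select}:mem=${memgb}\n'
                        '#PBS -l walltime=${wt}\n'
                        '${cmd}\n')

attrs = {'job_name': 'htseq_AB12', 'select': '1', 'memgb': '2gb',
         'wt': '12:00:00', 'cmd': 'python3 htseq_AB12.py'}
print(pbs_fragment.substitute(attrs))
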
22 changes: 14 additions & 8 deletions HTSeqCountCluster/pbsjob/pbsjob.py
@@ -3,15 +3,16 @@
from pkg_resources import resource_filename

from HTSeqCountCluster.logger import Logger
from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, writecodefile,
import_temp, file2str)
from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, write_code_file,
import_temp, file_to_str)
from HTSeqCountCluster.pbsjob.pbsconfig import __DEFAULT__
from HTSeqCountCluster import pbsjob
from HTSeqCountCluster.pbsjob.qstat import Qstat


class BasePBSJob(object):
"""Base class for simple jobs."""

def __init__(self, base_jobname):
"""Initialize job attributes."""
self.default_job_attributes = __DEFAULT__
@@ -43,6 +44,7 @@ def _cleanup(self, jobname):

class PBSJob(BasePBSJob):
"""Create a qsub/pbs job & script for the job to execute."""

def __init__(self, email_address, base_jobname=None):
super().__init__(base_jobname=base_jobname)
self.email = email_address
@@ -82,10 +84,13 @@ def submit_code(self, code, cleanup=True, default=True):
code_str = code

if default:
self.sgejob_log.info('You are running a job with default attributes.')
writecodefile(filename=self.jobname, code=code_str, language='python')
self.sgejob_log.info(
'You are running a job with default attributes.')
write_code_file(filename=self.jobname,
                code=code_str, language='python')
pyfilename = self.jobname + '.py'
self.sgejob_log.info('%s python file has been created.' % pyfilename)
self.sgejob_log.info(
'%s python file has been created.' % pyfilename)

# Create the pbs script from the template or dict
pbstemp = import_temp(self.temp_pbs)
@@ -104,7 +109,8 @@ def submit_code(self, code, cleanup=True, default=True):
try:
cmd = ['qsub ' + self.jobname + '.pbs'] # this is the command
# Shell MUST be True
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE,
shell=True, check=True)
except CalledProcessError as err:
self.sgejob_log.error(err.stderr.decode('utf-8'))
if cleanup:
@@ -144,7 +150,8 @@ def submit_cmd(self, cmd, cleanup=True):
try:
cmd = ['qsub ' + self.jobname + '.pbs'] # this is the command
# Shell MUST be True
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
cmd_status = run(cmd, stdout=PIPE, stderr=PIPE,
shell=True, check=True)
except CalledProcessError as err:
self.sgejob_log.error(err.stderr.decode('utf-8'))
if cleanup:
@@ -162,4 +169,3 @@ def submit_cmd(self, cmd, cleanup=True):

else: # Unsuccessful. Stdout will be '1'
self.sgejob_log.error('PBS job not submitted.')

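A hypothetical usage sketch of the PBSJob class shown above; the email address, sample name, and command are placeholders, and qsub must be available on the submitting machine:

from HTSeqCountCluster.pbsjob.pbsjob import PBSJob

# Placeholder values; submit_cmd writes a .pbs script and runs qsub on it.
job = PBSJob(email_address='user@example.com', base_jobname='sample1-AGTC')
job.submit_cmd('htseq-count -f bam -s no sample1.bam genes.gtf '
               '-o sample1-AGTC_htseq.out')
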
30 changes: 22 additions & 8 deletions HTSeqCountCluster/pbsjob/pbsutils.py
@@ -8,16 +8,26 @@


def basejobids(length, name='submit'):
""""Create base job id and name."""
base_id = randomid(length=length)
""""Create base job id and name.
:param length: [description]
:type length: [type]
:param name: [description], defaults to 'submit'
:type name: str, optional
:return: [description]
:rtype: [type]
"""
base_id = random_id(length=length)
base = name + "_{0}".format(base_id)

return base_id, base


def import_temp(filepath):
"""Import the script or file that you need a template of and that has
temp strings.
"""Import a template file that has template strings.
:param filepath: The path to the template file.
:type filepath: str
"""
file_temp = open(filepath, 'r')
file_str = file_temp.read()
@@ -27,19 +37,23 @@ def file2str(filepath):
return file_temp


def file2str(filepath):
"""Turn the contents of a file (python file) into a string."""
def file_to_str(filepath):
"""Turn the contents of a file (python file) into a string.
:param filepath: The path to the file to read.
:type filepath: str
"""
file_temp = open(filepath, 'r')
file_str = file_temp.read()
return file_str


def randomid(length=5):
def random_id(length=5):
"""Generate a random ID of 5 characters to append to qsub job name."""
return ''.join(random.sample(string.ascii_letters + string.digits, length))


def writecodefile(filename, code, language):
def write_code_file(filename, code, language):
"""Create a python file and write the code to it."""
if language == 'python':
with open(filename + '.py', 'w') as pyfile:
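
A short, hedged sketch of the renamed helpers above; the generated ids in the comments only illustrate the shape of the output:

from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, random_id,
                                               write_code_file)

print(random_id(length=4))             # e.g. 'aZ3k'

base_id, base = basejobids(4, name='htseq')
print(base)                            # e.g. 'htseq_aZ3k'

# Writes htseq_aZ3k.py containing the given code string.
write_code_file(filename=base, code="print('hello world')\n",
                language='python')
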
27 changes: 19 additions & 8 deletions HTSeqCountCluster/pbsjob/qstat.py
@@ -3,43 +3,54 @@
import getpass
import re

from HTSeqCountCluster.logger import Logger


class Qstat(object):
def __init__(self):
"""Initialize class."""
_username = getpass.getuser()
self.username = _username
self.split_regex = re.compile(r'\s+')
self.qstat_log = Logger().default(logname="qstat", logfile=None)

def qstatinfo(self, qstat_path='qstat'):
"""Retrieve qstat output."""
"""Retrieve qstat output.
:param qstat_path: The path to the qstat executable, defaults to 'qstat'
:type qstat_path: str, optional
"""
try:
qstatinfo = check_output([qstat_path])
except CalledProcessError as cpe:
return_code = 'qstat returncode: %s' % cpe.returncode
std_error = 'qstat standard output: %s' % cpe.stderr
print(return_code + '\n' + std_error)
self.qstat_log.error(return_code + '\n' + std_error)
except FileNotFoundError:
raise FileNotFoundError('qstat is not on your machine.')
else:
jobs = self._output_parser(qstatinfo)

jobs = self._output_parser(qstatinfo)

return jobs
return jobs

def _output_parser(self, output):
"""Parse output from qstat pbs commandline program.
Returns a list of dictionaries for each job.
:param output: The raw output from the qstat command.
:type output: bytes
"""
lines = output.decode('utf-8').split('\n')
del lines[:5]
jobs = []
for line in lines:
els = self.split_regex.split(line)
try:
j = {"job_id": els[0], "name": els[1], "user": els[2], "elapsed_time": els[3],
"status": els[4], "queue": els[5]}
jobs.append(j)
j = {"job_id": els[0], "name": els[1], "user": els[2],
"elapsed_time": els[3], "status": els[4],
"queue": els[5]}
jobs.append(j)

except IndexError:
pass
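
To make the parsing in _output_parser concrete, here is a sketch with a fabricated qstat data row (the header lines qstat prints are assumed to be stripped already, as the code above does with del lines[:5]):

import re

split_regex = re.compile(r'\s+')

# Fabricated qstat row; real output depends on the PBS installation.
line = '12345.cluster htseq_aZ3k shutchins 00:01:02 R workq'
els = split_regex.split(line)
job = {"job_id": els[0], "name": els[1], "user": els[2],
       "elapsed_time": els[3], "status": els[4], "queue": els[5]}
print(job["job_id"], job["status"])    # 12345.cluster R
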
13 changes: 11 additions & 2 deletions HTSeqCountCluster/utils/__init__.py
@@ -2,8 +2,17 @@
import pandas as pd


def csvtolist(csvfile):
def csvtolist(csvfile, column=0):
"""Convert a column of a csv file to a list.
:param csvfile: A comma delimited file.
:type csvfile: str
:param column: The index of the column to convert, defaults to 0.
:type column: int, optional
:return: A sorted list of the values in the chosen column.
:rtype: list
"""
df = pd.read_csv(csvfile, header=None)
output_list = sorted(list(df[0]))
output_list = sorted(list(df[column]))

return output_list
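
A small, self-contained usage sketch of csvtolist; the csv written here is a throwaway example file, not part of the repository:

import pandas as pd
from HTSeqCountCluster.utils import csvtolist

# Write a headerless, single-column csv like the one the script expects.
pd.DataFrame([['sample2-GATC'], ['sample1-AGTC']]).to_csv(
    'samples.csv', index=False, header=False)

print(csvtolist('samples.csv'))            # ['sample1-AGTC', 'sample2-GATC']
print(csvtolist('samples.csv', column=0))  # same column, selected explicitly
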
