Skip to content

Commit

Permalink
cosmetic changes, removing debugging lines
Browse files Browse the repository at this point in the history
  • Loading branch information
rkpfeil committed Sep 20, 2023
1 parent 03df386 commit 513078f
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 24 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ interested in comparing expression between different replicas/conditions within

### Specifying input data via dataset description file

This option will be deprecated in future releases.
This option will be deprecated in future releases. To process multiple experiments, please use `--yaml` instead.

If you wish to process several independent experiments in a single run, you should provide a dataset description
file via `--fastq_list` or `--bam_list` (see description below).
Expand Down Expand Up @@ -216,22 +216,22 @@ To provide all input files in a single file, you can provide a yaml file via `--
A distinct output folder with individual GTFs and abundance tables will be generated for each experiment.
With this option, bam files with short reads for correction can be provided for each experiment.

The yaml file contains a list of experiments in square brackets. The first entry in the list should be the type of files the experiments contain, written as `data format: ` followed by the type in quitation marks. The type can be either fastq or bam. Each experiment is represented by a set of curly brackets around a set of parameters. Each experiment should have a name and one or multiple input files in either fastq or bam format. Additionally it may contain one or multiple bam files with short reads. The name is provided as `name: ` followed by the experiment name in quotation marks. Both short and long read files are provided as a list of file paths in quotation marks, following `long read files: ` and `illumina bam: ` respectively. Labels for the files can also be set with `labels: `. The number of labels needs to be the same as the number of files. All entries are separated by commata. For example:
The yaml file contains a list of experiments in square brackets. The first entry in the list should be the type of files the experiments contain, written as `data format: ` followed by the type in quotation marks. The type can be either fastq or bam. Each experiment is represented by a set of curly brackets around a set of parameters. Each experiment should have a name and one or multiple input files in either fastq or bam format. Additionally, it may contain one or multiple bam files with short reads. The name is provided as `name: ` followed by the experiment name in quotation marks. Both long and short read files are provided as a list of file paths in quotation marks, following `long read files: ` and `illumina bam: ` respectively. Labels for the files can also be set with `labels: `. The number of labels needs to be the same as the number of files. All entries are separated by commas. For example:

```
[
data format: "fastq",
{
name: "experiment1",
long_read_files: [
long read files: [
"/PATH/TO/FILE1.fastq",
"/PATH/TO/FILE2.fastq"
],
illumina bam: ["PATH/TO/ILLUMINA1.bam"]
},
{
name: "experiment2",
long_read_files: [
long read files: [
"/PATH/TO/FILE3.fastq"
],
illumina bam: ["PATH/TO/ILLUMINA2.bam"]
Expand Down
35 changes: 15 additions & 20 deletions src/input_data_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,7 @@ def get_samples_from_file(self, file_name):
for lib in sample:
for in_file in lib:
check_input_type(in_file, self.input_type)

#print(sample_files)
#print(illumina_bam)

return sample_files, experiment_names, readable_names_dict, illumina_bam

def has_replicas(self):
Expand All @@ -211,7 +209,6 @@ def get_samples_from_yaml(self, file_name):
if len(t.keys()) > 1:
logger.warning("The first entry should only specify the input data format. Any additional info will be ignored")
if t['data format'] == "bam":
#print("yes")
self.input_type = "bam"
print(self.input_type)
elif t['data format'] == "fastq" or t['data format'] == "fasta":
Expand All @@ -232,12 +229,12 @@ def get_samples_from_yaml(self, file_name):
logger.warning("Duplicate folder prefix %s, will change to %s" %
(current_sample_name, new_sample_name))
current_sample_name = new_sample_name
current_index += 1
if not 'long read files' in sample.keys():
logger.critical("Experiment %s does not contain any files" %current_sample_name)
exit(-2)
else:
current_sample = sample['long read files']
#print(current_sample)
names = 'labels' in sample.keys()
if names and not len(sample['labels']) == len(current_sample):
logger.critical("The number of file aliases differs from the number of files")
Expand All @@ -260,31 +257,29 @@ def get_samples_from_yaml(self, file_name):
else:
illumina_bam.append(None)

# this is one for loop too many check why it works above and see if I have a list too little
# either use extra list or remove one loop. check with andrey if we actually have libs in samples
#print(sample_files)
for sample in sample_files:
for lib in sample:
for in_file in lib:
check_input_type(in_file, self.input_type)
return sample_files, experiment_names, readable_names_dict, illumina_bam

# what do I use here? sample files? or all files?
def get_sample_name(names, index):
common_characters = len(names[0])
common_name = names[0]
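# Not functional yet; kept commented out as a draft.
# Idea for the future: name unnamed samples after their last common folder.
# NOTE(review): the draft reads `common_names`, but the variable is `common_name`,
# and `mismatch` does not appear to be defined here; fix both before enabling.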
# def get_sample_name(names, index):
# common_characters = len(names[0])
# common_name = names[0]

for i in range(1, len(names)):
p = mismatch(common_name, names[i])
if p[0] < common_characters:
common_characters = p[0]
# for i in range(1, len(names)):
# p = mismatch(common_name, names[i])
# if p[0] < common_characters:
# common_characters = p[0]

found = common_names.rfind('/', 0, common_characters)
# found = common_names.rfind('/', 0, common_characters)

common_name = common_name[:found]
# common_name = common_name[:found]

sample_name = common_name + str(index)
return sample_name
# sample_name = common_name + str(index)
# return sample_name


def check_input_type(fname, input_type):
Expand Down

0 comments on commit 513078f

Please sign in to comment.