Skip to content

Commit

Permalink
Merge pull request #96 from ekg/temp_dir
Browse files Browse the repository at this point in the history
Temporary directory management
  • Loading branch information
AndreaGuarracino authored May 10, 2022
2 parents 706ef7e + c52262d commit f209482
Show file tree
Hide file tree
Showing 7 changed files with 212 additions and 70 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ set(CMAKE_BUILD_TYPE Release)
# set up our target executable and specify its dependencies and includes
add_executable(seqwish
${CMAKE_SOURCE_DIR}/src/utils.cpp
${CMAKE_SOURCE_DIR}/src/tempfile.cpp
${CMAKE_SOURCE_DIR}/src/main.cpp
${CMAKE_SOURCE_DIR}/src/seqindex.cpp
${CMAKE_SOURCE_DIR}/src/paf.cpp
Expand Down
47 changes: 19 additions & 28 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "time.hpp"
#include "utils.hpp"
#include "version.hpp"
#include "tempfile.hpp"

using namespace seqwish;

Expand All @@ -30,10 +31,10 @@ int main(int argc, char** argv) {
args::HelpFlag help(parser, "help", "display this help menu", {'h', "help"});
args::ValueFlag<std::string> paf_alns(parser, "FILE", "Induce the graph from these PAF formatted alignments. Optionally, a list of filenames and minimum match lengths: [file_1][:min_match_length_1],... This allows the differential filtering of short matches from some but not all inputs, in effect allowing `-k` to be specified differently for each input.", {'p', "paf-alns"});
args::ValueFlag<std::string> seqs(parser, "FILE", "The sequences used to generate the alignments (FASTA, FASTQ, .seq)", {'s', "seqs"});
args::ValueFlag<std::string> base(parser, "BASE", "Build graph using this basename", {'b', "base"});
args::ValueFlag<std::string> tmp_base(parser, "PATH", "directory for temporary files [default: `pwd`]", {'b', "temp-dir"});
args::ValueFlag<std::string> gfa_out(parser, "FILE", "Write the graph in GFA to FILE", {'g', "gfa"});
args::ValueFlag<std::string> sml_in(parser, "FILE", "Use the sequence match list in FILE to subset the input alignments", {'m', "match-list"});
args::ValueFlag<std::string> vgp_base(parser, "BASE", "Write the graph in VGP format with basename FILE", {'o', "vgp-out"});
//args::ValueFlag<std::string> vgp_base(parser, "BASE", "Write the graph in VGP format with basename FILE", {'o', "vgp-out"});
args::ValueFlag<int> thread_count(parser, "N", "Use this many threads during parallel steps", {'t', "threads"});
args::ValueFlag<uint64_t> repeat_max(parser, "N", "Limit transitive closure to include no more than N copies of a given input base", {'r', "repeat-max"});
args::ValueFlag<uint64_t> min_repeat_dist(parser, "N", "Prevent transitive closure for bases at least this far apart in input sequences", {'l', "min-repeat-distance"});
Expand Down Expand Up @@ -112,23 +113,27 @@ int main(int argc, char** argv) {
}
}

std::string work_base = args::get(base);
if (work_base.empty()) {
work_base = args::get(gfa_out);
if (tmp_base) {
temp_file::set_dir(args::get(tmp_base));
} else {
char* cwd = get_current_dir_name();
temp_file::set_dir(std::string(cwd));
free(cwd);
}

temp_file::set_keep_temp(args::get(keep_temp_files));

// 1) index the queries (Q) to provide sequence name to position and position to sequence name mapping, generating a CSA and a sequence file
if (args::get(show_progress)) std::cerr << "[seqwish::seqidx] " << std::fixed << std::showpoint << std::setprecision(3) << seconds_since(start_time) << " indexing sequences" << std::endl;
auto seqidx_ptr = std::make_unique<seqindex_t>();
auto& seqidx = *seqidx_ptr;
seqidx.build_index(args::get(seqs), work_base);
seqidx.build_index(args::get(seqs));
seqidx.save();
if (args::get(show_progress)) std::cerr << "[seqwish::seqidx] " << std::fixed << std::showpoint << std::setprecision(3) << seconds_since(start_time) << " index built" << std::endl;

// 2) parse the alignments into position pairs and index (A)
if (args::get(show_progress)) std::cerr << "[seqwish::alignments] " << std::fixed << std::showpoint << std::setprecision(3) << seconds_since(start_time) << " processing alignments" << std::endl;
std::string aln_idx = work_base + ".sqa";
std::remove(aln_idx.c_str());
const std::string aln_idx = temp_file::create("seqwish-", ".sqa");
auto aln_iitree_ptr = std::make_unique<mmmulti::iitree<uint64_t, pos_t>>(aln_idx);
auto& aln_iitree = *aln_iitree_ptr;
aln_iitree.open_writer();
Expand Down Expand Up @@ -157,12 +162,9 @@ int main(int argc, char** argv) {
}

// 3) find the transitive closures via the alignments and construct the graph sequence S, and the N and P interval sets
std::string seq_v_file = work_base + ".sqs";
std::string node_iitree_idx = work_base + ".sqn";
std::string path_iitree_idx = work_base + ".sqp";
std::remove(seq_v_file.c_str());
std::remove(node_iitree_idx.c_str());
std::remove(path_iitree_idx.c_str());
const std::string seq_v_file = temp_file::create("seqwish-", ".sqs");
const std::string node_iitree_idx = temp_file::create("seqwish-", ".sqn");
const std::string path_iitree_idx = temp_file::create("seqwish-", ".sqp");
auto node_iitree_ptr = std::make_unique<mmmulti::iitree<uint64_t, pos_t>>(node_iitree_idx); // maps graph seq to input seq
auto& node_iitree = *node_iitree_ptr;
auto path_iitree_ptr = std::make_unique<mmmulti::iitree<uint64_t, pos_t>>(path_iitree_idx); // maps input seq to graph seq
Expand Down Expand Up @@ -203,8 +205,7 @@ int main(int argc, char** argv) {

// 5) determine links between nodes
if (args::get(show_progress)) std::cerr << "[seqwish::links] " << std::fixed << std::showpoint << std::setprecision(3) << seconds_since(start_time) << " finding graph links" << std::endl;
std::string link_mm_idx = work_base + ".sql";
std::remove(link_mm_idx.c_str());
const std::string link_mm_idx = temp_file::create("seqwish-", ".sql");
auto link_mmset_ptr = std::make_unique<mmmulti::set<std::pair<pos_t, pos_t>>>(link_mm_idx);
auto& link_mmset = *link_mmset_ptr;
derive_links(seqidx, node_iitree, path_iitree, seq_id_cbv, seq_id_cbv_rank, seq_id_cbv_select, link_mmset, num_threads);
Expand All @@ -215,23 +216,13 @@ int main(int argc, char** argv) {
if (!args::get(gfa_out).empty()) {
std::ofstream out(args::get(gfa_out).c_str());
emit_gfa(out, graph_length, seq_v_file, node_iitree, path_iitree, seq_id_cbv, seq_id_cbv_rank, seq_id_cbv_select, seqidx, link_mmset, num_threads);
} else if (!args::get(vgp_base).empty()) {
/*} else if (!args::get(vgp_base).empty()) {
assert(false);
//emit_vgp(args::get(vgp_base), graph_length, seq_v_file, path_mm, link_fwd_mm, link_rev_mm, seq_id_cbv, seq_id_cbv_rank, seq_id_cbv_select, seqidx);
} else {
*/} else {
emit_gfa(std::cout, graph_length, seq_v_file, node_iitree, path_iitree, seq_id_cbv, seq_id_cbv_rank, seq_id_cbv_select, seqidx, link_mmset, num_threads);
}
if (args::get(show_progress)) std::cerr << "[seqwish::gfa] " << std::fixed << std::showpoint << std::setprecision(3) << seconds_since(start_time) << " done" << std::endl;

if (!args::get(keep_temp_files)) {
seqidx.remove_index_files();
std::remove(aln_idx.c_str());
std::remove(seq_v_file.c_str());
std::remove(node_iitree_idx.c_str());
std::remove(path_iitree_idx.c_str());
link_mmset.close_reader();
std::remove(link_mm_idx.c_str());
}

return(0);
}
20 changes: 10 additions & 10 deletions src/seqindex.cpp
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#include "seqindex.hpp"
#include "tempfile.hpp"

namespace seqwish {

// load a FASTA or FASTQ file into a file with a name index mapping name -> offset and indexed with a CSA
// provide queries over this index that let us extract particular positions and subsequences
void seqindex_t::set_base_filename(const std::string& filename) {
basefilename = filename;
seqfilename = basefilename + ".sqq";
seqidxfile = basefilename + ".sqi";
seqnamefile = basefilename + ".sqi.seqnames.tmp"; // used during construction
// Allocate a fresh temporary file for each of the three on-disk pieces of the
// index and remember their paths. Every call creates new files via
// temp_file::create(), so previously stored paths are overwritten.
// NOTE(review): load() also calls this and then opens seqidxfile, which means
// it reads a newly created temp file rather than the path passed to load() --
// confirm whether that is intended.
void seqindex_t::set_base_filename() {
    const std::string prefix = "seqwish-";
    seqfilename = temp_file::create(prefix, ".sqq");
    seqidxfile = temp_file::create(prefix, ".sqi");
    // used during construction
    seqnamefile = temp_file::create(prefix, ".sqi.seqnames.tmp");
}

void seqindex_t::build_index(const std::string& filename, const std::string& idxbasename) {
set_base_filename(idxbasename);
void seqindex_t::build_index(const std::string& filename) {
set_base_filename();
// read the file
igzstream in(filename.c_str());
std::ofstream seqnames(seqnamefile.c_str());
Expand Down Expand Up @@ -128,7 +128,7 @@ void seqindex_t::build_index(const std::string& filename, const std::string& idx
// look up each sequence by name
}

size_t seqindex_t::save(sdsl::structure_tree_node* s, std::string name) {
size_t seqindex_t::save(sdsl::structure_tree_node* s, const std::string& name) {
//assert(seq_name_csa.size() && seq_name_cbv.size() && seq_offset_civ.size());
sdsl::structure_tree_node* child = sdsl::structure_tree::add_child(s, name, sdsl::util::class_name(*this));
// open the sdsl index
Expand All @@ -150,7 +150,7 @@ size_t seqindex_t::save(sdsl::structure_tree_node* s, std::string name) {
return written;
}

void seqindex_t::remove_index_files(void) {
void seqindex_t::remove_index_files() {
std::remove(seqfilename.c_str());
std::remove(seqidxfile.c_str());
}
Expand Down Expand Up @@ -192,7 +192,7 @@ void seqindex_t::close_seq(void) {
}

void seqindex_t::load(const std::string& filename) {
set_base_filename(filename);
set_base_filename();
std::ifstream in(seqidxfile.c_str());
std::string magic;
in.read((char*)magic.c_str(), 6);
Expand Down
6 changes: 3 additions & 3 deletions src/seqindex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ class seqindex_t {

seqindex_t(void) { }
~seqindex_t(void) { close_seq(); }
void set_base_filename(const std::string& filename);
void build_index(const std::string& filename, const std::string& idxbasename);
size_t save(sdsl::structure_tree_node* s = NULL, std::string name = "");
void set_base_filename();
void build_index(const std::string& filename);
size_t save(sdsl::structure_tree_node* s = NULL, const std::string& name = "");
void load(const std::string& filename);
void remove_index_files(void);
void to_fasta(std::ostream& out, size_t linewidth = 60) const;
Expand Down
117 changes: 117 additions & 0 deletions src/tempfile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#include <set>
#include <iostream>
#include <unistd.h>
#include "tempfile.hpp"

namespace temp_file {

// We use this to make the API thread-safe. It is recursive because create()
// calls get_dir() while already holding the lock.
std::recursive_mutex monitor;

// Directory under which the per-run temp directory is created. Empty until
// set_dir() is called or get_dir() falls back to the working directory.
std::string temp_dir;

// When true, the handler leaves every temp file and the temp directory in place.
bool keep_temp = false;

/// Because the names are in a static object, we can delete them when
/// std::exit() is called.
struct Handler {
    /// Paths handed out by create() that have not been remove()d yet.
    std::set<std::string> filenames;
    /// Directory made by the first create() call; empty until then.
    std::string parent_directory;

    /// Delete all tracked files, whatever else is left directly inside the
    /// temp directory, and finally the directory itself (unless keep_temp).
    ~Handler() {
        if (keep_temp) {
            return;
        }
        // No need to lock in static destructor
        for (const auto& filename : filenames) {
            std::remove(filename.c_str());
        }
        if (parent_directory.empty()) {
            return;
        }
        // There may be extraneous files in the directory still (like .fai files).
        // opendir() yields nullptr when the directory has vanished or cannot be
        // read; readdir()/closedir() must not be called on a null stream.
        if (DIR* directory = opendir(parent_directory.c_str())) {
            while (dirent* dp = readdir(directory)) {
                const std::string entry = dp->d_name;
                if (entry == "." || entry == "..") {
                    // Never try to remove the directory itself or its parent here.
                    continue;
                }
                // For every item still in it, delete it.
                // TODO: Maybe eventually recursively delete?
                std::remove((parent_directory + "/" + entry).c_str());
            }
            closedir(directory);
        }

        // Delete the directory itself
        std::remove(parent_directory.c_str());
    }
} handler;

/// Set the directory under which the temp directory will be created.
void set_dir(const std::string& new_temp_dir) {
    std::lock_guard<std::recursive_mutex> lock(monitor);

    temp_dir = new_temp_dir;
}

/// Return the current temp dir. If none has been set yet it is initialized
/// to the current working directory; the process exits with status 1 if the
/// working directory cannot be determined.
std::string get_dir() {
    std::lock_guard<std::recursive_mutex> lock(monitor);

    // Default to the current working directory (the environment-variable
    // lookup below is disabled).
    if (temp_dir.empty()) {
        char* cwd = get_current_dir_name();
        if (cwd == nullptr) {
            // Building a std::string from a null pointer is undefined behaviour.
            std::cerr << "[seqwish::tempfile]: couldn't determine the current working directory" << std::endl;
            exit(1);
        }
        temp_dir = cwd;
        free(cwd);
        /*const char *system_temp_dir = nullptr;
        for (const char *var_name : {"TMPDIR", "TMP", "TEMP", "TEMPDIR", "USERPROFILE"}) {
            if (system_temp_dir == nullptr) {
                system_temp_dir = getenv(var_name);
            }
        }
        temp_dir = (system_temp_dir == nullptr ? "/tmp" : system_temp_dir);*/
    }

    return temp_dir;
}

/// Create an empty temporary file and return its path.
///
/// The first call creates a directory named `<base>XXXXXX` (X's replaced by
/// mkdtemp) under get_dir(); `base` is not used for naming after that. Each
/// file is placed in that directory and named `XXXXXX<suffix>`. The returned
/// path is tracked so that it is deleted at exit. Prints a message and exits
/// with status 1 if the directory or the file cannot be created.
std::string create(const std::string& base, const std::string& suffix) {
    std::lock_guard<std::recursive_mutex> lock(monitor);

    if (handler.parent_directory.empty()) {
        // Make a parent directory for our temp files. mkdtemp rewrites the
        // trailing X's in place, so the string itself serves as the buffer.
        std::string tmpdirname = get_dir() + "/" + base + "XXXXXX";
        if (mkdtemp(&tmpdirname[0]) == nullptr) {
            std::cerr << "[seqwish::tempfile]: couldn't create temp directory: " << tmpdirname << std::endl;
            exit(1);
        }
        // Save the directory we got
        handler.parent_directory = tmpdirname;
    }

    std::string tmpname = handler.parent_directory + "/XXXXXX" + suffix;
    // hack to use mkstemp to get us a safe temporary file name
    const int fd = mkstemps(&tmpname[0], static_cast<int>(suffix.size()));
    if (fd == -1) {
        std::cerr << "[seqwish::tempfile]: couldn't create temp file on base "
                  << base << " : " << tmpname << std::endl;
        exit(1);
    }
    // we don't leave it open; we are assumed to open it again externally
    close(fd);

    handler.filenames.insert(tmpname);
    return tmpname;
}

/// Delete a temporary file now and stop tracking it for exit-time cleanup.
void remove(const std::string& filename) {
    std::lock_guard<std::recursive_mutex> lock(monitor);

    std::remove(filename.c_str());
    handler.filenames.erase(filename);
}

/// Choose whether temp files and the temp directory are kept at exit.
void set_keep_temp(bool setting) {
    // Take the same lock as the other setters.
    std::lock_guard<std::recursive_mutex> lock(monitor);

    keep_temp = setting;
}
}
33 changes: 33 additions & 0 deletions src/tempfile.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include <string>
#include <mutex>
#include <cstdio>
#include <dirent.h>

#include <cstring>

/**
* Temporary files. Create with create() and remove with remove(). Unless
* set_keep_temp(true) has been called, all temporary files still tracked are
* deleted when the program exits normally or with std::exit(). The files are
* created inside a uniquely named subdirectory of the temp dir, which
* defaults to the current working directory and can be overridden with
* set_dir(). create(), remove(), set_dir() and get_dir() serialize on an
* internal mutex.
*/
namespace temp_file {

/// Create an empty temporary file and return its path.
/// The first call creates a directory named `<base>XXXXXX` (X's randomized)
/// under get_dir(); `base` is not used for naming after that. Each file is
/// placed in that directory and named `XXXXXX<suffix>`. Prints a message and
/// exits with status 1 if the directory or the file cannot be created.
std::string create(const std::string& base, const std::string& suffix);

/// Remove a temporary file now and stop tracking it for exit-time cleanup.
void remove(const std::string& filename);

/// Set the temp dir, overriding the current-working-directory default.
/// Has no effect on where files go once the first create() call has made
/// its directory.
void set_dir(const std::string& new_temp_dir);

/// Get the current temp dir. If none has been set, it is initialized to the
/// current working directory.
std::string get_dir();

/// When set to true, temp files and the temp directory are left in place at exit.
void set_keep_temp(bool setting);

} // namespace temp_file
Loading

0 comments on commit f209482

Please sign in to comment.