From 813f664bc83b4f729c521471c6dd0b01428169ed Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 20 Jan 2025 21:43:53 +0900 Subject: [PATCH 1/5] createsubdb support tab separated start end list --- src/util/createsubdb.cpp | 64 +++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/src/util/createsubdb.cpp b/src/util/createsubdb.cpp index dd20ccc30..c2732a42f 100644 --- a/src/util/createsubdb.cpp +++ b/src/util/createsubdb.cpp @@ -11,9 +11,11 @@ int createsubdb(int argc, const char **argv, const Command& command) { Parameters& par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); + bool isIndex = false; FILE *orderFile = NULL; if (FileUtil::fileExists(par.db1Index.c_str())) { orderFile = fopen(par.db1Index.c_str(), "r"); + isIndex = true; } else { if(FileUtil::fileExists(par.db1.c_str())){ orderFile = fopen(par.db1.c_str(), "r"); @@ -40,6 +42,9 @@ int createsubdb(int argc, const char **argv, const Command& command) { char dbKey[256]; unsigned int prevKey = 0; bool isOrdered = true; + char* result; + char newLine = '\n'; + std::vector arr; while (getline(&line, &len, orderFile) != -1) { Util::parseKey(line, dbKey); unsigned int key; @@ -61,22 +66,60 @@ int createsubdb(int argc, const char **argv, const Command& command) { Debug(Debug::WARNING) << "Key " << dbKey << " not found in database\n"; continue; } - if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { - writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), 0); + if (isIndex == true) { + if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { + writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), 0); + } else { + char* data = reader.getDataUncompressed(id); + size_t originalLength = reader.getEntryLen(id); + size_t entryLength = std::max(originalLength, static_cast(1)) - 1; + + if (isCompressed) { + // copy also the null byte since it contains the information if compressed or not + entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; + writer.writeData(data, entryLength, key, 0, false, false); + } else { + writer.writeData(data, entryLength, key, 0, true, false); + } + // do not write null byte since + writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); + } } else { + arr = Util::split(line, "\t"); char* data = reader.getDataUncompressed(id); size_t originalLength = reader.getEntryLen(id); size_t entryLength = std::max(originalLength, static_cast(1)) - 1; - - if (isCompressed) { - // copy also the null byte since it contains the information if compressed or not - entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; - writer.writeData(data, entryLength, key, 0, false, false); + int totalLength = 0; + if (arr.size() == 1) { + if (isCompressed) { + // copy also the null byte since it contains the information if compressed or not + entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; + writer.writeData(data, entryLength, key, 0, false, false); + } else { + writer.writeData(data, entryLength, key, 0, true, false); + } + // do not write null byte since + writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); + } else if (arr.size()%2 == 0) { + Debug(Debug::ERROR) << "Input list not in format\n"; } else { - writer.writeData(data, entryLength, key, 0, true, false); + result = new char[entryLength]; + for (int ord = 0 ; ord < int((arr.size()-1)/2); ord ++) { + if (isCompressed) { + //TODO. erase line below + totalLength+=1; + } else { + int currLength = std::stoi(arr[ord * 2 + 2]) - std::stoi(arr[ord * 2 + 1]) + 1; + strncpy(result + totalLength, data + std::stoi(arr[ord * 2 + 1]), currLength); + totalLength += currLength; + } + } + writer.writeData(result, totalLength, key, 0, false, false); + writer.writeAdd(&newLine, 1, 0); + delete [] result; + result = nullptr; } - // do not write null byte since - writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); + writer.writeIndexEntry(key, writer.getStart(0), totalLength + 2, 0); } } // merge any kind of sequence database @@ -89,7 +132,6 @@ int createsubdb(int argc, const char **argv, const Command& command) { } DBWriter::writeDbtypeFile(par.db3.c_str(), reader.getDbtype(), isCompressed); DBReader::softlinkDb(par.db2, par.db3, DBFiles::SEQUENCE_ANCILLARY); - free(line); reader.close(); if (fclose(orderFile) != 0) { From 195e74b205e8dd660408da3cbcd0e844368590d4 Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Mon, 20 Jan 2025 22:45:59 +0900 Subject: [PATCH 2/5] how to handle compressed db is unsolved --- src/util/createsubdb.cpp | 66 +++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/src/util/createsubdb.cpp b/src/util/createsubdb.cpp index c2732a42f..4805a58d3 100644 --- a/src/util/createsubdb.cpp +++ b/src/util/createsubdb.cpp @@ -47,6 +47,7 @@ int createsubdb(int argc, const char **argv, const Command& command) { std::vector arr; while (getline(&line, &len, orderFile) != -1) { Util::parseKey(line, dbKey); + arr = Util::split(line, "\t"); unsigned int key; if (lookupMode) { size_t lookupId = reader.getLookupIdByAccession(dbKey); @@ -66,60 +67,49 @@ int createsubdb(int argc, const char **argv, const Command& command) { Debug(Debug::WARNING) << "Key " << dbKey << " not found in database\n"; continue; } - if (isIndex == true) { - if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { - writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), 0); - } else { - char* data = reader.getDataUncompressed(id); - size_t originalLength = reader.getEntryLen(id); - size_t entryLength = std::max(originalLength, static_cast(1)) - 1; + if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { + writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), 0); + } else if (isIndex == true || arr.size() == 1) { + char* data = reader.getDataUncompressed(id); + size_t originalLength = reader.getEntryLen(id); + size_t entryLength = std::max(originalLength, static_cast(1)) - 1; - if (isCompressed) { - // copy also the null byte since it contains the information if compressed or not - entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; - writer.writeData(data, entryLength, key, 0, false, false); - } else { - writer.writeData(data, entryLength, key, 0, true, false); - } - // do not write null byte since - writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); + if (isCompressed) { + // copy also the null byte since it contains the information if compressed or not + entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; + writer.writeData(data, entryLength, key, 0, false, false); + } else { + writer.writeData(data, entryLength, key, 0, true, false); } + // do not write null byte since + writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); } else { - arr = Util::split(line, "\t"); char* data = reader.getDataUncompressed(id); size_t originalLength = reader.getEntryLen(id); size_t entryLength = std::max(originalLength, static_cast(1)) - 1; int totalLength = 0; - if (arr.size() == 1) { - if (isCompressed) { - // copy also the null byte since it contains the information if compressed or not - entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; - writer.writeData(data, entryLength, key, 0, false, false); - } else { - writer.writeData(data, entryLength, key, 0, true, false); - } - // do not write null byte since - writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); - } else if (arr.size()%2 == 0) { + if (isCompressed) { + entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; + } + if (arr.size()%2 == 0) { Debug(Debug::ERROR) << "Input list not in format\n"; } else { result = new char[entryLength]; for (int ord = 0 ; ord < int((arr.size()-1)/2); ord ++) { - if (isCompressed) { - //TODO. erase line below - totalLength+=1; - } else { - int currLength = std::stoi(arr[ord * 2 + 2]) - std::stoi(arr[ord * 2 + 1]) + 1; - strncpy(result + totalLength, data + std::stoi(arr[ord * 2 + 1]), currLength); - totalLength += currLength; - } + int currLength = std::stoi(arr[ord * 2 + 2]) - std::stoi(arr[ord * 2 + 1]) + 1; + strncpy(result + totalLength, data + std::stoi(arr[ord * 2 + 1]), currLength); + totalLength += currLength; + } + if (isCompressed) { + //TODO + } else { + writer.writeData(result, totalLength, key, 0, false, false); } - writer.writeData(result, totalLength, key, 0, false, false); writer.writeAdd(&newLine, 1, 0); + writer.writeIndexEntry(key, writer.getStart(0), totalLength + 2, 0); delete [] result; result = nullptr; } - writer.writeIndexEntry(key, writer.getStart(0), totalLength + 2, 0); } } // merge any kind of sequence database From c0a08aa30cded2a3a4da1b31fc7b38810574fece Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Wed, 19 Feb 2025 13:49:53 +0900 Subject: [PATCH 3/5] createsubdb can go through list of indexes --- src/util/createsubdb.cpp | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/util/createsubdb.cpp b/src/util/createsubdb.cpp index 4805a58d3..73fb04858 100644 --- a/src/util/createsubdb.cpp +++ b/src/util/createsubdb.cpp @@ -24,7 +24,8 @@ int createsubdb(int argc, const char **argv, const Command& command) { EXIT(EXIT_FAILURE); } } - + //no multithreading + unsigned int thread_idx = 0; const bool lookupMode = par.dbIdMode == Parameters::ID_MODE_LOOKUP; int dbMode = DBReader::USE_INDEX|DBReader::USE_DATA; if (lookupMode) { @@ -34,7 +35,7 @@ int createsubdb(int argc, const char **argv, const Command& command) { reader.open(DBReader::NOSORT); const bool isCompressed = reader.isCompressed(); - DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE); + DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, isCompressed, Parameters::DBTYPE_OMIT_FILE); writer.open(); // getline reallocs automatic char *line = NULL; @@ -44,6 +45,7 @@ int createsubdb(int argc, const char **argv, const Command& command) { bool isOrdered = true; char* result; char newLine = '\n'; + char nullByte = '\0'; std::vector arr; while (getline(&line, &len, orderFile) != -1) { Util::parseKey(line, dbKey); @@ -68,7 +70,7 @@ int createsubdb(int argc, const char **argv, const Command& command) { continue; } if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { - writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), 0); + writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), thread_idx); } else if (isIndex == true || arr.size() == 1) { char* data = reader.getDataUncompressed(id); size_t originalLength = reader.getEntryLen(id); @@ -77,38 +79,42 @@ int createsubdb(int argc, const char **argv, const Command& command) { if (isCompressed) { // copy also the null byte since it contains the information if compressed or not entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; - writer.writeData(data, entryLength, key, 0, false, false); + writer.writeData(data, entryLength, key, thread_idx, false, false); } else { - writer.writeData(data, entryLength, key, 0, true, false); + writer.writeData(data, entryLength, key, thread_idx, true, false); } // do not write null byte since - writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); + writer.writeIndexEntry(key, writer.getStart(0), originalLength, thread_idx); } else { - char* data = reader.getDataUncompressed(id); - size_t originalLength = reader.getEntryLen(id); - size_t entryLength = std::max(originalLength, static_cast(1)) - 1; - int totalLength = 0; - if (isCompressed) { - entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; - } if (arr.size()%2 == 0) { Debug(Debug::ERROR) << "Input list not in format\n"; } else { + char* data; + if (isCompressed) { + data = reader.getDataCompressed(id, thread_idx); + } else { + data = reader.getDataUncompressed(id); + } + size_t entryLength = std::max(reader.getEntryLen(id), static_cast(1)); + int totalLength = 0; result = new char[entryLength]; for (int ord = 0 ; ord < int((arr.size()-1)/2); ord ++) { int currLength = std::stoi(arr[ord * 2 + 2]) - std::stoi(arr[ord * 2 + 1]) + 1; strncpy(result + totalLength, data + std::stoi(arr[ord * 2 + 1]), currLength); totalLength += currLength; } + result[totalLength] = newLine; if (isCompressed) { - //TODO + writer.writeData(result, totalLength + 1, key, thread_idx, true, false); } else { - writer.writeData(result, totalLength, key, 0, false, false); + writer.writeData(result, totalLength, key, thread_idx, false, false); + writer.writeAdd(&newLine, sizeof(char), thread_idx); + writer.writeAdd(&nullByte, sizeof(char), thread_idx); } - writer.writeAdd(&newLine, 1, 0); - writer.writeIndexEntry(key, writer.getStart(0), totalLength + 2, 0); delete [] result; result = nullptr; + + writer.writeIndexEntry(key, writer.getStart(0), totalLength + 2, thread_idx); } } } From 19e19862b98b586145792f57b689b3bf5ea6192b Mon Sep 17 00:00:00 2001 From: Sooyoung Cha Date: Tue, 4 Mar 2025 17:36:35 +0900 Subject: [PATCH 4/5] Update createsubdb.cpp --- src/util/createsubdb.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util/createsubdb.cpp b/src/util/createsubdb.cpp index 73fb04858..e1ad848fc 100644 --- a/src/util/createsubdb.cpp +++ b/src/util/createsubdb.cpp @@ -71,7 +71,8 @@ int createsubdb(int argc, const char **argv, const Command& command) { } if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), thread_idx); - } else if (isIndex == true || arr.size() == 1) { + } else if (isIndex == true || arr.size() == 1 || reader.getDbtype() == DBTYPE_GENERIC_DB) { + //how to handel c_alpha char* data = reader.getDataUncompressed(id); size_t originalLength = reader.getEntryLen(id); size_t entryLength = std::max(originalLength, static_cast(1)) - 1; From 1947d7cbd8555a8024ceeee4c9a32409f07ade2a Mon Sep 17 00:00:00 2001 From: SooyoungCha Date: Thu, 13 Mar 2025 02:48:44 +0900 Subject: [PATCH 5/5] added flag for range input --- src/commons/Parameters.cpp | 3 +++ src/commons/Parameters.h | 2 ++ src/util/createsubdb.cpp | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 3e4b9bd5c..04191ac34 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -305,6 +305,7 @@ Parameters::Parameters(): PARAM_ID_MODE(PARAM_ID_MODE_ID, "--id-mode", "Database ID mode", "Select DB entries based on 0: database keys, 1: FASTA identifiers (.lookup)", typeid(int), (void *) &dbIdMode, "^[0-1]{1}$"), PARAM_TAR_INCLUDE(PARAM_TAR_INCLUDE_ID, "--tar-include", "Tar Inclusion Regex", "Include file names based on this regex", typeid(std::string), (void *) &tarInclude, "^.*$"), PARAM_TAR_EXCLUDE(PARAM_TAR_EXCLUDE_ID, "--tar-exclude", "Tar Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &tarExclude, "^.*$"), + PARAM_INPUT_MODE(PARAM_INPUT_MODE_ID, "--input-mode", "Input list mode", "0: only index, 1: index and range", typeid(int), (void *) &inputmode, "^[0-1]{1}$"), // unpackdb PARAM_UNPACK_SUFFIX(PARAM_UNPACK_SUFFIX_ID, "--unpack-suffix", "Unpack suffix", "File suffix for unpacked files.\nAdd .gz suffix to write compressed files.", typeid(std::string), (void *) &unpackSuffix, "^.*$"), PARAM_UNPACK_NAME_MODE(PARAM_UNPACK_NAME_MODE_ID, "--unpack-name-mode", "Unpack name mode", "Name unpacked files by 0: DB key, 1: accession (through .lookup)", typeid(int), (void *) &unpackNameMode, "^[0-1]{1}$"), @@ -1169,6 +1170,7 @@ Parameters::Parameters(): createsubdb.push_back(&PARAM_SUBDB_MODE); createsubdb.push_back(&PARAM_ID_MODE); createsubdb.push_back(&PARAM_V); + createsubdb.push_back(&PARAM_INPUT_MODE); // renamedbkeys renamedbkeys.push_back(&PARAM_SUBDB_MODE); @@ -2623,6 +2625,7 @@ void Parameters::setDefaults() { // createsubdb subDbMode = Parameters::SUBDB_MODE_HARD; dbIdMode = Parameters::ID_MODE_KEYS; + inputmode = 0; // tar2db tarInclude = ".*"; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 174145011..3004dd90e 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -708,6 +708,7 @@ class Parameters { // createsubdb int subDbMode; int dbIdMode; + int inputmode; // tar2db std::string tarInclude; @@ -1065,6 +1066,7 @@ class Parameters { // createsubdb PARAMETER(PARAM_SUBDB_MODE) PARAMETER(PARAM_ID_MODE) + PARAMETER(PARAM_INPUT_MODE) // tar2db PARAMETER(PARAM_TAR_INCLUDE) diff --git a/src/util/createsubdb.cpp b/src/util/createsubdb.cpp index 73fb04858..13baa4787 100644 --- a/src/util/createsubdb.cpp +++ b/src/util/createsubdb.cpp @@ -71,11 +71,10 @@ int createsubdb(int argc, const char **argv, const Command& command) { } if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), thread_idx); - } else if (isIndex == true || arr.size() == 1) { + } else if (isIndex == true || arr.size() == 1 || par.inputmode == 0) { char* data = reader.getDataUncompressed(id); size_t originalLength = reader.getEntryLen(id); size_t entryLength = std::max(originalLength, static_cast(1)) - 1; - if (isCompressed) { // copy also the null byte since it contains the information if compressed or not entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1;