Skip to content

Commit

Permalink
Reimplement splitToWords so it can split path names too.
Browse files Browse the repository at this point in the history
  • Loading branch information
brianwalenz committed May 1, 2018
1 parent a432247 commit 0b4ce31
Show file tree
Hide file tree
Showing 11 changed files with 233 additions and 171 deletions.
193 changes: 101 additions & 92 deletions src/AS_UTL/splitToWords.H
Original file line number Diff line number Diff line change
Expand Up @@ -38,117 +38,126 @@
#ifndef SPLITTOWORDS_H
#define SPLITTOWORDS_H

#include "AS_global.H"

// Selects which separator test splitToWords applies to each character:
// splitWords uses isSpace(), splitPaths uses isPath().
enum splitType { splitWords = 0, splitPaths = 1 };

class splitToWords {
public:
splitToWords() {
_argWords = 0;
_maxWords = 0;
_arg = 0L;
_maxChars = 0;
_cmd = 0L;
};
splitToWords(char *cmd) {
_argWords = 0;
_maxWords = 0;
_arg = 0L;
_maxChars = 0;
_cmd = 0L;

split(cmd);
// Construct a splitter that owns no storage yet.  If 'string' is supplied,
// it is split immediately using the separator rule selected by 'type'.
splitToWords(const char *string=NULL, splitType type=splitWords) {

  // No word pointers allocated or in use.
  _words    = NULL;
  _wordsMax = 0;
  _wordsLen = 0;

  // No character storage allocated or in use.
  _chars    = NULL;
  _charsMax = 0;
  _charsLen = 0;

  if (string != NULL)
    split(string, type);
}

~splitToWords() {
delete [] _cmd;
delete [] _arg;
delete [] _chars;
delete [] _words;
};

private:
// True when 'c' is the path separator '/'.
bool isPath(char c) {
  return c == '/';
}

// True when 'c' is one of the four recognized white-space characters:
// space, tab, newline or carriage return.
bool isSpace(char c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}

void split(char *cmd) {
// True when 'c' separates words under the given split mode:
// white-space for splitWords, '/' for splitPaths.  Any other
// mode value has no separators.
bool isSeparator(char c, splitType type) {
  if (type == splitWords)
    return isSpace(c);

  if (type == splitPaths)
    return isPath(c);

  return false;
}

// Step Zero:
//
// Count the length of the string, in words and in characters.
// For simplicity, we overcount words, by just counting white-space.
//
// Then, allocate space for a temporary copy of the string, and a
// set of pointers into the temporary copy (much like argv).
//
uint32 cmdChars = 1; // 1 == Space for terminating 0
uint32 cmdWords = 2; // 2 == Space for first word and terminating 0L
public:
void split(const char *line, splitType type=splitWords) {

for (char *tmp=cmd; *tmp; tmp++) {
cmdWords += (*tmp == ' ') ? 1 : 0;
cmdWords += (*tmp == '\t') ? 1 : 0;
cmdChars++;
}
_wordsLen = 0; // Initialize to no words
_charsLen = 0; // and no characters.

if (cmdChars > _maxChars) {
delete [] _cmd;
_cmd = new char [cmdChars];
_maxChars = cmdChars;
}
if (cmdWords > _maxWords) {
delete [] _arg;
_arg = new char * [cmdWords];
_maxWords = cmdWords;
}
if (line == NULL) // Bail if there isn't a line to process.
return;

_argWords = 0;

// Step One:
//
// Determine where the words are in the command string, copying the
// string to _cmd and storing words in _arg.
//
bool isFirst = true;
char *cmdI = cmd;
char *cmdO = _cmd;

while (*cmdI) {

// If we are at a non-space character, we are in a word. If
// this is the first character in the word, save the word in
// the args list.
//
// Otherwise we are at a space and thus not in a word. Make
// all spaces be string terminators, and declare that we are
// at the start of a word.
//
if ((*cmdI != ' ') && (*cmdI != '\t') && (*cmdI != '\n') && (*cmdI != '\r')) {
*cmdO = *cmdI;

if (isFirst) {
_arg[_argWords++] = cmdO;
isFirst = false;
}
} else {
*cmdO = 0;
isFirst = true;
}
// Count the number of words and chars in the input line, then make
// sure there is space for us to store them.

cmdI++;
cmdO++;
while (line[_charsLen] != 0)
if (isSeparator(line[_charsLen++], type))
_wordsLen++;

resizeArray(_words, 0, _wordsMax, _wordsLen + 1, resizeArray_doNothing);
resizeArray(_chars, 0, _charsMax, _charsLen + 1, resizeArray_doNothing);

// Clear all the words pointers, and copy the input line to our storage.
// This greatly simplifies the loop, as we don't need to worry about
// terminating the final word.

memset(_words, 0, sizeof(char *) * (_wordsLen + 1));
memcpy(_chars, line, sizeof(char) * (_charsLen + 1));

// Scan the line copy, converting word separators to NUL bytes, and
// counting and saving the start of each word in _words.

_wordsLen = 0;

for (uint32 st=1, ii=0; ii < _charsLen; ii++) {
if (isSeparator(line[ii], type)) { // If the character is a word
_chars[ii] = 0; // separator, convert to NUL,
st = true; // and flag the next character
} // as the start of a new word.

else if (st) { // Otherwise, if this is the
_words[_wordsLen++] = _chars + ii; // start of a word, make
st = false; // a new word.
}
}
};

// Number of words currently available from this splitter.
uint32 numWords(void) {
  return _wordsLen;
}

// Indexing is shorthand for first(i).
char *operator[](uint32 i) {
  return first(i);
}

// Return word 'i' counting from the front (i=0 is the first word),
// or NULL when 'i' is not less than the number of words.
char *first(uint32 i=0) {
  if (i < _wordsLen)
    return _words[i];

  return NULL;
}
char *shift(void) {
if (_wordsLen == 0) // If no words, nothing to return.
return(NULL);

// Finish off the list by terminating the last arg, and
// terminating the list of args.
//
*cmdO = 0;
_arg[_argWords] = 0L;
for (uint32 ii=1; ii<_wordsLen; ii++) // Shift all words down one place, moving
swap(_words[ii-1], _words[ii]); // the word to shift off to the end.

return(_words[--_wordsLen]); // Return the word we shifted out.
};

// Return word 'i' counting back from the end (i=0 is the final word),
// or NULL when 'i' is not less than the number of words.
//
// The bounds test mirrors first(): checking only for an empty list
// would let '_wordsLen - i - 1' wrap around (unsigned arithmetic) for
// any i >= _wordsLen and index far outside _words.
char *last(uint32 i=0) {
  if (_wordsLen <= i)
    return NULL;

  return _words[_wordsLen - i - 1];
}
// Remove the final word from the list and return it, or return NULL
// (leaving the list untouched) when there are no words.
char *pop(void) {
  if (_wordsLen == 0)
    return NULL;

  _wordsLen -= 1;

  return _words[_wordsLen];
}

// Numeric conversions of word 'i', each delegating to the matching
// strto*() helper.  No bounds check is made on 'i' here.
int32 toint32(uint32 i) {
  return strtoint32(_words[i]);
}

uint32 touint32(uint32 i) {
  return strtouint32(_words[i]);
}

int64 toint64(uint32 i) {
  return strtoint64(_words[i]);
}

uint64 touint64(uint32 i) {
  return strtouint64(_words[i]);
}

double todouble(uint32 i) {
  return strtodouble(_words[i]);
}

uint32 numWords(void) { return(_argWords); };
char *getWord(uint32 i) { return(_arg[i]); };
char *operator[](uint32 i) { return(_arg[i]); };
int64 operator()(uint32 i) { return(strtoull(_arg[i], NULL, 10)); };
private:
uint32 _argWords;
uint32 _maxWords;
char **_arg;
uint32 _maxChars;
char *_cmd;
};
uint32 _wordsLen;
uint32 _wordsMax;
char **_words;

uint32 _charsLen;
uint32 _charsMax;
char *_chars;
};

#endif // SPLITTOWORDS_H
53 changes: 53 additions & 0 deletions src/AS_UTL/splitToWordsTest.C
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

/******************************************************************************
*
* This file is part of canu, a software program that assembles whole-genome
* sequencing reads into contigs.
*
* This software is based on:
* 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
* the 'kmer package' (http://kmer.sourceforge.net)
* both originally distributed by Applera Corporation under the GNU General
* Public License, version 2.
*
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
* Modifications by:
*
* Brian P. Walenz beginning on 2016-MAY-01
* are a 'United States Government Work', and
* are released in the public domain
*
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/

#include "splitToWords.H"

// Test driver for splitToWords.
//
// Each command line argument is either a mode switch or a string to split:
//   -p   split the following arguments on '/' (splitPaths)
//   -w   split the following arguments on white-space (splitWords, the default)
//
// For every other argument, the argument itself and then each word found
// in it are written to stderr.
int
main(int argc, char **argv) {
  splitToWords  W;
  splitType     type = splitWords;

  // 'arg' is a signed int to match argc, avoiding a signed/unsigned comparison.
  for (int arg=1; arg<argc; arg++) {
    if (strcmp(argv[arg], "-p") == 0) {
      type = splitPaths;
      continue;
    }

    if (strcmp(argv[arg], "-w") == 0) {
      type = splitWords;
      continue;
    }

    W.split(argv[arg], type);

    fprintf(stderr, "'%s'\n", argv[arg]);

    for (uint32 ii=0; ii<W.numWords(); ii++)
      fprintf(stderr, "%02u - '%s'\n", ii, W[ii]);
  }

  // Return rather than exit() so W's destructor runs and releases its storage.
  return(0);
}
24 changes: 12 additions & 12 deletions src/bogus/bogusUtil.C
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,11 @@ loadNucmer(char *nucmerName,
// Unlike snapper, these are already in base-based coords.

A.frgIID = IIDmap[fID];
A.frgBgn = W(2);
A.frgEnd = W(3);
A.frgBgn = W.toint32(2);
A.frgEnd = W.toint32(3);
A.genIID = refMap[gID];
A.genBgn = W(0);
A.genEnd = W(1);
A.genBgn = W.toint32(0);
A.genEnd = W.toint32(1);
A.chnBgn = refList[A.genIID].rschnBgn + A.genBgn;
A.chnEnd = refList[A.genIID].rschnBgn + A.genEnd;
A.identity = atof(W[wIdent]);
Expand All @@ -173,8 +173,8 @@ loadNucmer(char *nucmerName,
A.isRepeat = true;

if (A.frgBgn > A.frgEnd) {
A.frgBgn = W(3);
A.frgEnd = W(2);
A.frgBgn = W.toint32(3);
A.frgEnd = W.toint32(2);
A.isReverse = true;
}

Expand Down Expand Up @@ -244,11 +244,11 @@ loadSnapper(char *snapperName,
// "+1" -- Convert from space-based coords to base-based coords.

A.frgIID = IIDmap[fID];
A.frgBgn = W(3) + 1;
A.frgEnd = W(4);
A.frgBgn = W.toint32(3) + 1;
A.frgEnd = W.toint32(4);
A.genIID = refMap[gID];
A.genBgn = W(6) + 1;
A.genEnd = W(7);
A.genBgn = W.toint32(6) + 1;
A.genEnd = W.toint32(7);
A.chnBgn = refList[A.genIID].rschnBgn + A.genBgn;
A.chnEnd = refList[A.genIID].rschnBgn + A.genEnd;
A.identity = atof(W[8]);
Expand All @@ -257,8 +257,8 @@ loadSnapper(char *snapperName,
A.isRepeat = true;

if (A.frgBgn > A.frgEnd) {
A.frgBgn = W(4) + 1;
A.frgEnd = W(3);
A.frgBgn = W.toint32(4) + 1;
A.frgEnd = W.toint32(3);
A.isReverse = true;
}

Expand Down
32 changes: 16 additions & 16 deletions src/correction/errorEstimate.C
Original file line number Diff line number Diff line change
Expand Up @@ -99,41 +99,41 @@ main(int argc, char **argv) {
splitToWords W(ovStr);

if (isOvl) {
ov.a_iid = W(0);
ov.b_iid = W(1);
ov.a_iid = W.toint32(0);
ov.b_iid = W.toint32(1);
if (ov.a_iid == ov.b_iid)
continue;
ov.dat.ovl.ahg5 = W(4);
ov.dat.ovl.ahg3 = W(6);
ov.dat.ovl.bhg5 = W(6);
ov.dat.ovl.bhg3 = W(7);
ov.span(W(3));
ov.dat.ovl.ahg5 = W.toint32(4);
ov.dat.ovl.ahg3 = W.toint32(6);
ov.dat.ovl.bhg5 = W.toint32(6);
ov.dat.ovl.bhg3 = W.toint32(7);
ov.span(W.toint32(3));
ov.erate(atof(W[8]));
ov.flipped(W[3][0] == 'I' ? true : false);

} else {
ov.a_iid = W(0);
ov.b_iid = W(1);
ov.a_iid = W.toint32(0);
ov.b_iid = W.toint32(1);

if (ov.a_iid == ov.b_iid)
continue;

assert(W[4][0] == '0');

ov.dat.ovl.ahg5 = W(5);
ov.dat.ovl.ahg3 = W(7) - W(6);
ov.dat.ovl.ahg5 = W.toint32(5);
ov.dat.ovl.ahg3 = W.toint32(7) - W.toint32(6);

if (W[8][0] == '0') {
ov.dat.ovl.bhg5 = W(9);
ov.dat.ovl.bhg3 = W(11) - W(10);
ov.dat.ovl.bhg5 = W.toint32(9);
ov.dat.ovl.bhg3 = W.toint32(11) - W.toint32(10);
ov.flipped(false);
} else {
ov.dat.ovl.bhg3 = W(9);
ov.dat.ovl.bhg5 = W(11) - W(10);
ov.dat.ovl.bhg3 = W.toint32(9);
ov.dat.ovl.bhg5 = W.toint32(11) - W.toint32(10);
ov.flipped(true);
}
ov.erate(atof(W[2]));
ov.span(W(10)-W(9));
ov.span(W.toint32(10)-W.toint32(9));
}

if (ov.erate() == 0.0)
Expand Down
Loading

0 comments on commit 0b4ce31

Please sign in to comment.