Reimplement splitToWords so it can split path names too.

Jolvii85 · May 1, 2018 · 0b4ce31 · 0b4ce31
1 parent a432247
commit 0b4ce31
Show file tree

Hide file tree

Showing 11 changed files with 233 additions and 171 deletions.
diff --git a/src/AS_UTL/splitToWords.H b/src/AS_UTL/splitToWords.H
@@ -38,117 +38,126 @@
 #ifndef SPLITTOWORDS_H
 #define SPLITTOWORDS_H
 
+#include "AS_global.H"
+
+//  Selects which characters split() treats as word separators.
+enum splitType {
+  splitWords = 0,   //  Separate on space, tab, newline and carriage return.
+  splitPaths = 1    //  Separate on '/'.
+};
 
 class splitToWords {
 public:
-  splitToWords() {
-    _argWords = 0;
-    _maxWords = 0;
-    _arg      = 0L;
-    _maxChars = 0;
-    _cmd      = 0L;
-  };
-  splitToWords(char *cmd) {
-    _argWords = 0;
-    _maxWords = 0;
-    _arg      = 0L;
-    _maxChars = 0;
-    _cmd      = 0L;
-
-    split(cmd);
+  //  Construct an empty splitter.  If 'string' is not NULL it is split
+  //  immediately, using the separators selected by 'type'.
+  splitToWords(const char *string=NULL, splitType type=splitWords) :
+    _wordsLen(0), _wordsMax(0), _words(NULL),
+    _charsLen(0), _charsMax(0), _chars(NULL) {
+    if (string != NULL)
+      split(string, type);
   };
+
   ~splitToWords() {
-    delete [] _cmd;
-    delete [] _arg;
+    delete [] _chars;      //  Our private copy of the split line.
+    delete [] _words;      //  The pointers to word starts within _chars.
+  };
+
+private:
+  //  True if 'c' is the path component separator.
+  bool   isPath(char c) {
+    return c == '/';
   };
 
+  //  True if 'c' is one of the whitespace characters that separate words:
+  //  space, tab, newline or carriage return.
+  bool   isSpace(char c) {
+    switch (c) {
+      case ' ':
+      case '\t':
+      case '\n':
+      case '\r':
+        return(true);
+      default:
+        return(false);
+    }
+  };
 
-  void   split(char *cmd) {
+  //  True if 'c' separates words under the given split 'type'.  A type that
+  //  is neither splitWords nor splitPaths has no separators at all.
+  bool   isSeparator(char c, splitType type) {
+    if (type == splitWords)
+      return(isSpace(c));
+
+    if (type == splitPaths)
+      return(isPath(c));
+
+    return(false);
+  };
 
-    //  Step Zero:
-    //
-    //  Count the length of the string, in words and in characters.
-    //  For simplicity, we overcount words, by just counting white-space.
-    //
-    //  Then, allocate space for a temporary copy of the string, and a
-    //  set of pointers into the temporary copy (much like argv).
-    //
-    uint32   cmdChars = 1;  //  1 == Space for terminating 0
-    uint32   cmdWords = 2;  //  2 == Space for first word and terminating 0L
+public:
+  void   split(const char *line, splitType type=splitWords) {   //  Split 'line' into words, replacing any previous result.
 
-    for (char *tmp=cmd; *tmp; tmp++) {
-      cmdWords += (*tmp == ' ')  ? 1 : 0;
-      cmdWords += (*tmp == '\t') ? 1 : 0;
-      cmdChars++;
-    }
+    _wordsLen = 0;        //  Initialize to no words
+    _charsLen = 0;        //  and no characters.
 
-    if (cmdChars > _maxChars) {
-      delete [] _cmd;
-      _cmd      = new char   [cmdChars];
-      _maxChars = cmdChars;
-    }
-    if (cmdWords > _maxWords) {
-      delete [] _arg;
-      _arg      = new char * [cmdWords];
-      _maxWords = cmdWords;
-    }
+    if (line == NULL)     //  Bail if there isn't a line to process; we then hold zero words.
+      return;
 
-    _argWords = 0;
-
-    //  Step One:
-    //
-    //  Determine where the words are in the command string, copying the
-    //  string to _cmd and storing words in _arg.
-    //
-    bool           isFirst  = true;
-    char          *cmdI = cmd;
-    char          *cmdO = _cmd;
-
-    while (*cmdI) {
-
-      //  If we are at a non-space character, we are in a word.  If
-      //  this is the first character in the word, save the word in
-      //  the args list.
-      //
-      //  Otherwise we are at a space and thus not in a word.  Make
-      //  all spaces be string terminators, and declare that we are
-      //  at the start of a word.
-      //
-      if ((*cmdI != ' ') && (*cmdI != '\t') && (*cmdI != '\n') && (*cmdI != '\r')) {
-        *cmdO = *cmdI;
-
-        if (isFirst) {
-          _arg[_argWords++] = cmdO;
-          isFirst           = false;
-        }
-      } else {
-        *cmdO   = 0;
-        isFirst = true;
-      }
+    //  Count the separators and chars in the input line.  There can be at most
+    //  one more word than separators, so _wordsLen + 1 pointers always suffice.
 
-      cmdI++;
-      cmdO++;
+    while (line[_charsLen] != 0)
+      if (isSeparator(line[_charsLen++], type))
+        _wordsLen++;
+
+    resizeArray(_words, 0, _wordsMax, _wordsLen + 1, resizeArray_doNothing);
+    resizeArray(_chars, 0, _charsMax, _charsLen + 1, resizeArray_doNothing);
+
+    //  Clear all the word pointers, and copy the input line (including its
+    //  terminating NUL) to our storage, so the final word needs no special
+    //  termination.  NOTE(review): 'line' presumably must not point into _chars.
+
+    memset(_words, 0,    sizeof(char *) * (_wordsLen + 1));
+    memcpy(_chars, line, sizeof(char)   * (_charsLen + 1));
+
+    //  Scan the line copy, converting word separators to NUL bytes, and
+    //  counting and saving the start of each word in _words.
+
+    _wordsLen = 0;        //  Recount: from here on this counts words, not separators.
+
+    for (uint32 st=1, ii=0; ii < _charsLen; ii++) {   //  st: the next non-separator starts a word.
+      if (isSeparator(line[ii], type)) {      //  If the character is a word
+        _chars[ii] = 0;                       //  separator, convert to NUL,
+        st         = true;                    //  and flag the next character
+      }                                       //  as the start of a new word.
+
+      else if (st) {                          //  Otherwise, if this is the
+        _words[_wordsLen++] = _chars + ii;    //  start of a word, make
+        st                  = false;          //  a new word.
+      }
     }
+  };
+
+  //  Number of words currently held.
+  uint32  numWords(void) {
+    return _wordsLen;
+  };
+
+  //  Word 'i' counting from the front, or NULL if there is no such word;
+  //  simply forwards to first(i).
+  char   *operator[](uint32 i) {
+    return first(i);
+  };
+
+  //  Word 'i' counting from the front (i=0 is the first word), or NULL if
+  //  'i' is not less than the number of words.
+  char   *first(uint32 i=0) {
+    if (i >= _wordsLen)
+      return NULL;
+
+    return _words[i];
+  };
+  //  Remove and return the first word, or NULL if there are none.  The
+  //  removed pointer is parked in the slot just past the new end of the list.
+  char   *shift(void) {
+    if (_wordsLen == 0)
+      return(NULL);
 
-    //  Finish off the list by terminating the last arg, and
-    //  terminating the list of args.
-    //
-    *cmdO           = 0;
-    _arg[_argWords] = 0L;
+    char *front = _words[0];                //  The word being shifted off.
+
+    for (uint32 ii=1; ii<_wordsLen; ii++)   //  Slide every remaining word
+      _words[ii-1] = _words[ii];            //  down one place.
+
+    _words[--_wordsLen] = front;            //  Park it after the shortened list.
+
+    return(front);
   };
 
+  //  Word 'i' counting back from the end (i=0 is the final word), or NULL if
+  //  there is no such word.  The guard is on 'i', not just on an empty list,
+  //  so that the unsigned index _wordsLen - i - 1 can never wrap around.
+  char   *last(uint32 i=0)      { return((_wordsLen <= i) ? NULL : _words[_wordsLen - i - 1]); };
+  //  Remove and return the final word, or NULL if there are none.
+  char   *pop(void) {
+    if (_wordsLen == 0)
+      return NULL;
+
+    _wordsLen -= 1;
+
+    return _words[_wordsLen];
+  };
+
+  //  Numeric conversions of word 'i', each delegating to the matching
+  //  strto*() helper.  The index is used as given, with no range check.
+  int32   toint32(uint32 i)     { return strtoint32 (_words[i]); };
+  uint32  touint32(uint32 i)    { return strtouint32(_words[i]); };
+  int64   toint64(uint32 i)     { return strtoint64 (_words[i]); };
+  uint64  touint64(uint32 i)    { return strtouint64(_words[i]); };
+  double  todouble(uint32 i)    { return strtodouble(_words[i]); };
 
-  uint32  numWords(void)        { return(_argWords); };
-  char   *getWord(uint32 i)     { return(_arg[i]); };
-  char   *operator[](uint32 i)  { return(_arg[i]); };
-  int64  operator()(uint32 i)  { return(strtoull(_arg[i], NULL, 10)); };
 private:
-  uint32    _argWords;
-  uint32    _maxWords;
-  char    **_arg;
-  uint32    _maxChars;
-  char     *_cmd;
-};
+  uint32    _wordsLen;    //  Number of words currently held.
+  uint32    _wordsMax;    //  Handed to resizeArray with _words; presumably its allocated size.
+  char    **_words;       //  Start of each word; every entry points into _chars.
 
+  uint32    _charsLen;    //  Length of the last split line, not counting its NUL.
+  uint32    _charsMax;    //  Handed to resizeArray with _chars; presumably its allocated size.
+  char     *_chars;       //  Copy of the split line, with separators overwritten by NUL.
+};
 
 #endif  //  SPLITTOWORDS_H
diff --git a/src/AS_UTL/splitToWordsTest.C b/src/AS_UTL/splitToWordsTest.C
@@ -0,0 +1,53 @@
+
+/******************************************************************************
+ *
+ *  This file is part of canu, a software program that assembles whole-genome
+ *  sequencing reads into contigs.
+ *
+ *  This software is based on:
+ *    'Celera Assembler' (http://wgs-assembler.sourceforge.net)
+ *    the 'kmer package' (http://kmer.sourceforge.net)
+ *  both originally distributed by Applera Corporation under the GNU General
+ *  Public License, version 2.
+ *
+ *  Canu branched from Celera Assembler at its revision 4587.
+ *  Canu branched from the kmer project at its revision 1994.
+ *
+ *  Modifications by:
+ *
+ *    Brian P. Walenz beginning on 2016-MAY-01
+ *      are a 'United States Government Work', and
+ *      are released in the public domain
+ *
+ *  File 'README.licenses' in the root directory of this distribution contains
+ *  full conditions and disclaimers for each license.
+ */
+
+#include "splitToWords.H"
+
+//  Test driver for splitToWords.  Arguments are processed left to right:
+//    -p     split the following arguments as paths (on '/')
+//    -w     split the following arguments as words (on whitespace); the default
+//    other  split the argument and print it, then each word, to stderr
+int
+main(int argc, char **argv) {
+  splitToWords  W;
+  splitType     type = splitWords;
+
+  //  A signed index, so the comparison against argc is not mixed-sign.
+  for (int arg=1; arg<argc; arg++) {
+    if (strcmp(argv[arg], "-p") == 0) {
+      type = splitPaths;
+      continue;
+    }
+
+    if (strcmp(argv[arg], "-w") == 0) {
+      type = splitWords;
+      continue;
+    }
+
+    W.split(argv[arg], type);
+
+    fprintf(stderr, "'%s'\n", argv[arg]);
+
+    for (uint32 ii=0; ii<W.numWords(); ii++)
+      fprintf(stderr, "%02u - '%s'\n", ii, W[ii]);
+  }
+
+  //  Return instead of calling exit(), so that W is destroyed normally.
+  return(0);
+}
diff --git a/src/bogus/bogusUtil.C b/src/bogus/bogusUtil.C
@@ -160,11 +160,11 @@ loadNucmer(char                       *nucmerName,
     //  Unlike snapper, these are already in base-based coords.
 
     A.frgIID    = IIDmap[fID];
-    A.frgBgn    = W(2);
-    A.frgEnd    = W(3);
+    A.frgBgn    = W.toint32(2);
+    A.frgEnd    = W.toint32(3);
     A.genIID    = refMap[gID];
-    A.genBgn    = W(0);
-    A.genEnd    = W(1);
+    A.genBgn    = W.toint32(0);
+    A.genEnd    = W.toint32(1);
     A.chnBgn    = refList[A.genIID].rschnBgn + A.genBgn;
     A.chnEnd    = refList[A.genIID].rschnBgn + A.genEnd;
     A.identity  = atof(W[wIdent]);
@@ -173,8 +173,8 @@ loadNucmer(char                       *nucmerName,
     A.isRepeat  = true;
 
     if (A.frgBgn > A.frgEnd) {
-      A.frgBgn    = W(3);
-      A.frgEnd    = W(2);
+      A.frgBgn    = W.toint32(3);
+      A.frgEnd    = W.toint32(2);
       A.isReverse = true;
     }
 
@@ -244,11 +244,11 @@ loadSnapper(char                       *snapperName,
     //  "+1" -- Convert from space-based coords to base-based coords.
 
     A.frgIID    = IIDmap[fID];
-    A.frgBgn    = W(3) + 1;
-    A.frgEnd    = W(4);
+    A.frgBgn    = W.toint32(3) + 1;
+    A.frgEnd    = W.toint32(4);
     A.genIID    = refMap[gID];
-    A.genBgn    = W(6) + 1;
-    A.genEnd    = W(7);
+    A.genBgn    = W.toint32(6) + 1;
+    A.genEnd    = W.toint32(7);
     A.chnBgn    = refList[A.genIID].rschnBgn + A.genBgn;
     A.chnEnd    = refList[A.genIID].rschnBgn + A.genEnd;
     A.identity  = atof(W[8]);
@@ -257,8 +257,8 @@ loadSnapper(char                       *snapperName,
     A.isRepeat  = true;
 
     if (A.frgBgn > A.frgEnd) {
-      A.frgBgn    = W(4) + 1;
-      A.frgEnd    = W(3);
+      A.frgBgn    = W.toint32(4) + 1;
+      A.frgEnd    = W.toint32(3);
       A.isReverse = true;
     }
 

diff --git a/src/correction/errorEstimate.C b/src/correction/errorEstimate.C
@@ -99,41 +99,41 @@ main(int argc, char **argv) {
       splitToWords  W(ovStr);
 
       if (isOvl) {
-         ov.a_iid = W(0);
-         ov.b_iid = W(1);
+         ov.a_iid = W.toint32(0);
+         ov.b_iid = W.toint32(1);
          if (ov.a_iid == ov.b_iid)
             continue;
-         ov.dat.ovl.ahg5 = W(4);
-         ov.dat.ovl.ahg3 = W(6);
-         ov.dat.ovl.bhg5 = W(6);
-         ov.dat.ovl.bhg3 = W(7);
-         ov.span(W(3));
+         ov.dat.ovl.ahg5 = W.toint32(4);
+         ov.dat.ovl.ahg3 = W.toint32(6);
+         ov.dat.ovl.bhg5 = W.toint32(6);
+         ov.dat.ovl.bhg3 = W.toint32(7);
+         ov.span(W.toint32(3));
          ov.erate(atof(W[8]));
          ov.flipped(W[3][0] == 'I' ? true : false);
 
       } else {
-         ov.a_iid = W(0);
-         ov.b_iid = W(1);
+         ov.a_iid = W.toint32(0);
+         ov.b_iid = W.toint32(1);
 
          if (ov.a_iid == ov.b_iid)
             continue;
 
          assert(W[4][0] == '0');
 
-         ov.dat.ovl.ahg5 = W(5);
-         ov.dat.ovl.ahg3 = W(7) - W(6);
+         ov.dat.ovl.ahg5 = W.toint32(5);
+         ov.dat.ovl.ahg3 = W.toint32(7) - W.toint32(6);
 
          if (W[8][0] == '0') {
-            ov.dat.ovl.bhg5 = W(9);
-            ov.dat.ovl.bhg3 = W(11) - W(10);
+            ov.dat.ovl.bhg5 = W.toint32(9);
+            ov.dat.ovl.bhg3 = W.toint32(11) - W.toint32(10);
             ov.flipped(false);
          } else {
-            ov.dat.ovl.bhg3 = W(9);
-            ov.dat.ovl.bhg5 = W(11) - W(10);
+            ov.dat.ovl.bhg3 = W.toint32(9);
+            ov.dat.ovl.bhg5 = W.toint32(11) - W.toint32(10);
             ov.flipped(true);
          }
          ov.erate(atof(W[2]));
-         ov.span(W(10)-W(9));
+         ov.span(W.toint32(10)-W.toint32(9));
       }
 
       if (ov.erate() == 0.0)