Unescape strings during sema pass.

fubark · Mar 9, 2024 · 2bddd40 · 2bddd40
1 parent 2502ee7
commit 2bddd40
Show file tree

Hide file tree

Showing 9 changed files with 70 additions and 35 deletions.
diff --git a/src/ast.zig b/src/ast.zig
@@ -73,6 +73,7 @@ pub const NodeType = enum(u8) {
     opAssignStmt,
     passStmt,
     range,
+    raw_string_lit,
     recordLit,
     returnExprStmt,
     returnStmt,
@@ -140,7 +141,7 @@ const NodeHead = packed struct {
     },
 };
 
-/// At most 16 bytes in release mode.
+/// At most 8 bytes for ReleaseFast.
 const NodeData = union {
     uninit: void,
     expandOpt: struct {
@@ -319,7 +320,7 @@ const NodeData = union {
         name: NodeId,
         typeSpec: NodeId,
     },
-    type_copy_decl: struct {
+    type_copy_decl: packed struct {
         header: NodeId,
         func_head: u24,
         num_funcs: u8,
@@ -415,7 +416,7 @@ const NodeData = union {
         numParams: u8,
         typeDecl: NodeId,
     },
-    range: struct {
+    range: packed struct {
         start: cy.Nullable(NodeId),
         end: cy.Nullable(u24),
         inc: bool,
@@ -563,8 +564,8 @@ pub const UnaryOp = enum(u8) {
 test "ast internals." {
     if (builtin.mode == .ReleaseFast) {
         try t.eq(@sizeOf(NodeHead), 4);
-        try t.eq(@sizeOf(NodeData), 16);
-        try t.eq(@sizeOf(Node), 24);
+        try t.eq(@sizeOf(NodeData), 8);
+        try t.eq(@sizeOf(Node), 16);
     } else {
         try t.eq(@sizeOf(NodeHead), 4);
         try t.eq(@sizeOf(NodeData), 16);
@@ -583,6 +584,9 @@ pub const Ast = struct {
     templateCtNodes: std.ArrayListUnmanaged(NodeId),
 
     /// Heap generated strings, stable pointers unlike `srcGen`.
+    /// Used for:
+    /// - Unnamed struct identifiers.
+    /// - Unescaped strings.
     strs: std.ArrayListUnmanaged([]const u8),
 
     /// Optionally parsed by tokenizer.
@@ -790,7 +794,7 @@ pub const AstView = struct {
             const lastName = self.nodeString(last);
 
             var end = last.srcPos + last.data.span.len;
-            if (last.type() == .stringLit) {
+            if (last.type() == .raw_string_lit) {
                 end += 1;
             }
             return .{
@@ -1009,6 +1013,9 @@ pub const Encoder = struct {
             .ident => {
                 try w.writeAll(self.ast.nodeString(node));
             },
+            .raw_string_lit => {
+                try w.writeAll(self.ast.nodeStringAndDelim(node));
+            },
             .stringLit => {
                 try w.writeAll(self.ast.nodeStringAndDelim(node));
             },

diff --git a/src/bc_gen.zig b/src/bc_gen.zig
@@ -1136,12 +1136,11 @@ fn genStringTemplate(c: *Chunk, idx: usize, cstr: Cstr, nodeId: cy.NodeId) !GenV
 
 fn genString(c: *Chunk, idx: usize, cstr: Cstr, nodeId: cy.NodeId) !GenValue {
     const data = c.ir.getExprData(idx, .string);
-    const str = try c.unescapeString(data.literal);
     const inst = try c.rega.selectForNoErrNoDepInst(cstr, true, nodeId);
     if (inst.requiresPreRelease) {
         try pushRelease(c, inst.dst, nodeId);
     }
-    try pushStringConst(c, str, inst.dst, nodeId);
+    try pushStringConst(c, data.raw, inst.dst, nodeId);
     return finishNoErrNoDepInst(c, inst, true);
 }
 

diff --git a/src/cgen.zig b/src/cgen.zig
@@ -1161,7 +1161,7 @@ fn genString(c: *Chunk, loc: usize, cstr: Cstr, nodeId: cy.NodeId) !Value {
     // if (inst.requiresPreRelease) {
     //     try pushRelease(c, inst.dst, nodeId);
     // }
-    const c_lit = try cStringLit(c, data.literal);
+    const c_lit = try cStringLit(c, data.raw);
 
     try c.bufPushFmt("STRING(\"{s}\")", .{c_lit});
 
@@ -1877,12 +1877,9 @@ fn retExprStmt(c: *Chunk, loc: usize, nodeId: cy.NodeId) !void {
     // }
 }
 
-pub fn cStringLit(self: *Chunk, lit: []const u8) ![]const u8 {
-    // Big enough to hold unescaped lit and escape c lit.
-    try self.base.tempBufU8.resize(self.alloc, lit.len * 3);
-
-    // First unescape Cyber literal.
-    const unescaped = try cy.sema.unescapeString(self.base.tempBufU8.items[0..lit.len], lit, false);
+pub fn cStringLit(self: *Chunk, raw: []const u8) ![]const u8 {
+    // Big enough to hold escaped C literal.
+    try self.base.tempBufU8.resize(self.alloc, raw.len * 2);
 
     // Escape to C literal.
     const ReplaceChars = "\\\"";
@@ -1895,19 +1892,19 @@ pub fn cStringLit(self: *Chunk, lit: []const u8) ![]const u8 {
             };
         }
     };
-    if (std.mem.indexOfAny(u8, unescaped, ReplaceChars)) |idx| {
-        var fbuf = std.io.fixedBufferStream(self.base.tempBufU8.items[lit.len..]);
+    if (std.mem.indexOfAny(u8, raw, ReplaceChars)) |idx| {
+        var fbuf = std.io.fixedBufferStream(self.base.tempBufU8.items[0..]);
         const w = fbuf.writer();
-        try w.print("{s}{s}", .{unescaped[0..idx], S.replacement(unescaped[idx])});
+        try w.print("{s}{s}", .{raw[0..idx], S.replacement(raw[idx])});
 
-        var rest = unescaped[idx+1..];
+        var rest = raw[idx+1..];
         while (std.mem.indexOfAny(u8, rest, ReplaceChars)) |idx2| {
             try w.print("{s}{s}", .{rest[0..idx2], S.replacement(rest[idx2])});
             rest = rest[idx2+1..];
         }
         try w.writeAll(rest);
         return fbuf.getWritten();
     } else {
-        return unescaped;
+        return raw;
     }
-}
+}
diff --git a/src/cyon.zig b/src/cyon.zig
@@ -449,7 +449,10 @@ pub const DecodeMapIR = struct {
     pub fn allocString(self: DecodeMapIR, key: []const u8) ![]const u8 {
         if (self.map.get(key)) |val_id| {
             const val_n = self.ast.node(val_id);
-            if (val_n.type() == .stringLit) {
+            if (val_n.type() == .raw_string_lit) {
+                const token_s = self.ast.nodeString(val_n);
+                return try self.alloc.dupe(u8, token_s);
+            } else if (val_n.type() == .stringLit) {
                 const token_s = self.ast.nodeString(val_n);
                 var buf = std.ArrayList(u8).init(self.alloc);
                 defer buf.deinit();
@@ -587,6 +590,7 @@ pub const DecodeValueIR = struct {
         switch (node.type()) {
             .arrayLit => return .list,
             .recordLit => return .map,
+            .raw_string_lit,
             .stringLit => return .string,
             .hexLit,
             .binLit,

diff --git a/src/ir.zig b/src/ir.zig
@@ -534,7 +534,7 @@ pub const Array = struct {
 };
 
 pub const String = struct {
-    literal: []const u8,
+    raw: []const u8,
 };
 
 pub const StringTemplate = struct {

diff --git a/src/parser.zig b/src/parser.zig
@@ -553,6 +553,10 @@ pub const Parser = struct {
                 self.advance();
                 return try self.pushSpanNode(.ident, start);
             },
+            .raw_string => {
+                self.advance();
+                return try self.pushSpanNode(.raw_string_lit, start);
+            },
             .string => {
                 self.advance();
                 return try self.pushSpanNode(.stringLit, start);
@@ -1429,7 +1433,7 @@ pub const Parser = struct {
                     return self.reportError("Expected import specifier.", &.{});
                 };
                 const spec_t = self.ast.nodeType(spec);
-                if (spec_t == .stringLit) {
+                if (spec_t == .raw_string_lit) {
                     try self.consumeNewLineOrEnd();
                 } else {
                     return self.reportError("Expected import specifier to be a string. {}", &.{fmt.v(spec_t)});
@@ -2929,6 +2933,10 @@ pub const Parser = struct {
                 self.advance();
                 break :b try self.pushSpanNode(.runeLit, start);
             },
+            .raw_string => b: {
+                self.advance();
+                break :b try self.pushSpanNode(.raw_string_lit, start);
+            },
             .string => b: {
                 self.advance();
                 break :b try self.pushSpanNode(.stringLit, start);
@@ -3082,6 +3090,7 @@ pub const Parser = struct {
                 .and_k,
                 .as_k,
                 .capture,
+                .raw_string,
                 .string,
                 .bin,
                 .oct,
@@ -3837,7 +3846,7 @@ fn isRecedingIndent(p: *Parser, prevIndent: u32, curIndent: u32, indent: u32) !b
 fn isRecordKeyNodeType(node_t: cy.NodeType) bool {
     switch (node_t) {
         .ident,
-        .stringLit,
+        .raw_string_lit,
         .decLit,
         .binLit,
         .octLit,

diff --git a/src/sema.zig b/src/sema.zig
@@ -614,7 +614,7 @@ pub fn semaStmt(c: *cy.Chunk, nodeId: cy.NodeId) !cy.NodeId {
                     }
 
                     const arg = c.ast.node(expr.data.callExpr.argHead);
-                    if (arg.type() != .stringLit) {
+                    if (arg.type() != .stringLit and arg.type() != .raw_string_lit) {
                         return c.reportErrorFmt("genLabel expected string arg", &.{}, nodeId);
                     }
 
@@ -2290,7 +2290,7 @@ fn resolveLocalDeclNamePath(c: *cy.Chunk, nameId: cy.NodeId) !DeclNamePathResult
         const name = c.ast.nodeString(last);
 
         var end = last.srcPos + name.len;
-        if (last.type() == .stringLit) {
+        if (last.type() == .raw_string_lit) {
             end += 1;
         }
         const namePath = c.ast.src[nameN.srcPos..end];
@@ -3820,6 +3820,7 @@ pub const ChunkExt = struct {
                 return try semaIdent(c, nodeId, true);
             },
             .stringLit => return c.semaString(c.ast.nodeString(node), nodeId),
+            .raw_string_lit => return c.semaRawString(c.ast.nodeString(node), nodeId),
             .runeLit => {
                 const literal = c.ast.nodeString(node);
                 if (literal.len == 0) {
@@ -4043,7 +4044,7 @@ pub const ChunkExt = struct {
                             const name = c.ast.nodeString(key);
                             c.ir.setArrayItem(irKeysIdx, []const u8, i, name);
                         },
-                        .stringLit => {
+                        .raw_string_lit => {
                             const name = c.ast.nodeString(key);
                             c.ir.setArrayItem(irKeysIdx, []const u8, i, name);
                         },
@@ -4163,8 +4164,7 @@ pub const ChunkExt = struct {
                 if (child.type() == .ident) {
                     const name = c.ast.nodeString(child);
                     if (std.mem.eql(u8, name, "modUri")) {
-                        const irIdx = try c.ir.pushExpr(.string, c.alloc, bt.String, nodeId, .{ .literal = c.srcUri });
-                        return ExprResult.initStatic(irIdx, bt.String);
+                        return c.semaRawString(c.srcUri, nodeId);
                     } else {
                         return c.reportErrorFmt("Compile-time symbol does not exist: {}", &.{v(name)}, node.data.comptimeExpr.child);
                     }
@@ -4328,8 +4328,20 @@ pub const ChunkExt = struct {
         }
     }
 
-    pub fn semaString(c: *cy.Chunk, str: []const u8, nodeId: cy.NodeId) !ExprResult {
-        const irIdx = try c.ir.pushExpr(.string, c.alloc, bt.String, nodeId, .{ .literal = str });
+    /// Analyzes the escaped string literal `lit` and emits a `.string` IR expression for it.
+    ///
+    /// `lit` is first passed through `c.unescapeString`. When the result still points at
+    /// `lit`, it is forwarded to `semaRawString` as-is. Otherwise the result is copied with
+    /// `c.alloc` and the copy is appended to `ast.strs` before being forwarded.
+    /// NOTE(review): presumably the copy is needed because the unescaped bytes live in a
+    /// reusable scratch buffer — confirm against `unescapeString`.
+    ///
+    /// Propagates errors from unescaping, allocation, and `semaRawString`.
+    pub fn semaString(c: *cy.Chunk, lit: []const u8, nodeId: cy.NodeId) !ExprResult {
+        const raw = try c.unescapeString(lit);
+        if (raw.ptr == lit.ptr) {
+            // The result aliases the literal itself, so no copy is required.
+            return c.semaRawString(raw, nodeId);
+        }
+
+        // Dupe and track in ast.strs.
+        const dupe = try c.alloc.dupe(u8, raw);
+        c.parser.ast.strs.append(c.alloc, dupe) catch |err| {
+            // `strs` never recorded the copy, so free it here instead of leaking it.
+            // This is deliberately not an `errdefer`: once the append succeeds, `strs`
+            // holds the copy and a later failure must not free it a second time.
+            c.alloc.free(dupe);
+            return err;
+        };
+        return c.semaRawString(dupe, nodeId);
+    }
+
+    /// Pushes a `.string` IR expression of type `bt.String` whose payload is `raw` and
+    /// wraps the resulting IR index with `ExprResult.initStatic`.
+    /// The slice is stored as given; it is neither unescaped nor copied here.
+    pub fn semaRawString(c: *cy.Chunk, raw: []const u8, nodeId: cy.NodeId) !ExprResult {
+        const irIdx = try c.ir.pushExpr(.string, c.alloc, bt.String, nodeId, .{ .raw = raw });
         return ExprResult.initStatic(irIdx, bt.String);
     }
 

diff --git a/src/tokenizer.zig b/src/tokenizer.zig
@@ -102,6 +102,7 @@ pub const TokenType = enum(u8) {
     right_bracket,
     right_paren,
     rune,
+    raw_string,
     string,
     struct_k,
     switch_k,
@@ -796,7 +797,7 @@ pub const Tokenizer = struct {
                 } else return t.reportErrorAt("UnterminatedString", &.{}, start);
             }
             if (peek(t) == '\'') {
-                try t.pushSpanToken(.string, start, t.nextPos);
+                try t.pushSpanToken(.raw_string, start, t.nextPos);
                 advance(t);
                 return;
             } else if (peek(t) == '\n') {
@@ -826,7 +827,7 @@ pub const Tokenizer = struct {
                     continue;
                 };
                 if (ch == '\'' and ch2 == '\'') {
-                    try t.pushSpanToken(.string, start, t.nextPos);
+                    try t.pushSpanToken(.raw_string, start, t.nextPos);
                     advance(t);
                     advance(t);
                     advance(t);
@@ -1023,6 +1024,6 @@ test "tokenizer internals." {
     try tt.eq(@alignOf(Token), 4);
     try tt.eq(@sizeOf(TokenizeState), 4);
 
-    try tt.eq(std.enums.values(TokenType).len, 69);
+    try tt.eq(std.enums.values(TokenType).len, 70);
     try tt.eq(keywords.kvs.len, 34);
 }
diff --git a/test/builtins/strings.cy b/test/builtins/strings.cy
@@ -3,6 +3,12 @@ import t 'test'
 var escaped = 'Return the underlying `symbol`.'.replace('`', '\\`')
 t.eq(escaped, 'Return the underlying \\`symbol\\`.')
 
+-- Single quotes do not interpret escape sequences.
+var a = '\n'
+t.eq(a.len(), 2)
+t.eq(a[0] == `\\`, true)
+t.eq(a[1] == `n`, true)
+
 -- Indexing an invalid utf8.
 var invalid = "\xc3\x28"
 t.eq(invalid.len(), 2)