Skip to content

Commit

Permalink
Unescape strings during sema pass.
Browse files Browse the repository at this point in the history
  • Loading branch information
fubark committed Mar 9, 2024
1 parent 2502ee7 commit 2bddd40
Show file tree
Hide file tree
Showing 9 changed files with 70 additions and 35 deletions.
19 changes: 13 additions & 6 deletions src/ast.zig
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ pub const NodeType = enum(u8) {
opAssignStmt,
passStmt,
range,
raw_string_lit,
recordLit,
returnExprStmt,
returnStmt,
Expand Down Expand Up @@ -140,7 +141,7 @@ const NodeHead = packed struct {
},
};

/// At most 16 bytes in release mode.
/// At most 8 bytes for ReleaseFast.
const NodeData = union {
uninit: void,
expandOpt: struct {
Expand Down Expand Up @@ -319,7 +320,7 @@ const NodeData = union {
name: NodeId,
typeSpec: NodeId,
},
type_copy_decl: struct {
type_copy_decl: packed struct {
header: NodeId,
func_head: u24,
num_funcs: u8,
Expand Down Expand Up @@ -415,7 +416,7 @@ const NodeData = union {
numParams: u8,
typeDecl: NodeId,
},
range: struct {
range: packed struct {
start: cy.Nullable(NodeId),
end: cy.Nullable(u24),
inc: bool,
Expand Down Expand Up @@ -563,8 +564,8 @@ pub const UnaryOp = enum(u8) {
test "ast internals." {
if (builtin.mode == .ReleaseFast) {
try t.eq(@sizeOf(NodeHead), 4);
try t.eq(@sizeOf(NodeData), 16);
try t.eq(@sizeOf(Node), 24);
try t.eq(@sizeOf(NodeData), 8);
try t.eq(@sizeOf(Node), 16);
} else {
try t.eq(@sizeOf(NodeHead), 4);
try t.eq(@sizeOf(NodeData), 16);
Expand All @@ -583,6 +584,9 @@ pub const Ast = struct {
templateCtNodes: std.ArrayListUnmanaged(NodeId),

/// Heap generated strings, stable pointers unlike `srcGen`.
/// Used for:
/// - Unnamed struct identifiers.
/// - Unescaped strings.
strs: std.ArrayListUnmanaged([]const u8),

/// Optionally parsed by tokenizer.
Expand Down Expand Up @@ -790,7 +794,7 @@ pub const AstView = struct {
const lastName = self.nodeString(last);

var end = last.srcPos + last.data.span.len;
if (last.type() == .stringLit) {
if (last.type() == .raw_string_lit) {
end += 1;
}
return .{
Expand Down Expand Up @@ -1009,6 +1013,9 @@ pub const Encoder = struct {
.ident => {
try w.writeAll(self.ast.nodeString(node));
},
.raw_string_lit => {
try w.writeAll(self.ast.nodeStringAndDelim(node));
},
.stringLit => {
try w.writeAll(self.ast.nodeStringAndDelim(node));
},
Expand Down
3 changes: 1 addition & 2 deletions src/bc_gen.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1136,12 +1136,11 @@ fn genStringTemplate(c: *Chunk, idx: usize, cstr: Cstr, nodeId: cy.NodeId) !GenV

fn genString(c: *Chunk, idx: usize, cstr: Cstr, nodeId: cy.NodeId) !GenValue {
const data = c.ir.getExprData(idx, .string);
const str = try c.unescapeString(data.literal);
const inst = try c.rega.selectForNoErrNoDepInst(cstr, true, nodeId);
if (inst.requiresPreRelease) {
try pushRelease(c, inst.dst, nodeId);
}
try pushStringConst(c, str, inst.dst, nodeId);
try pushStringConst(c, data.raw, inst.dst, nodeId);
return finishNoErrNoDepInst(c, inst, true);
}

Expand Down
23 changes: 10 additions & 13 deletions src/cgen.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1161,7 +1161,7 @@ fn genString(c: *Chunk, loc: usize, cstr: Cstr, nodeId: cy.NodeId) !Value {
// if (inst.requiresPreRelease) {
// try pushRelease(c, inst.dst, nodeId);
// }
const c_lit = try cStringLit(c, data.literal);
const c_lit = try cStringLit(c, data.raw);

try c.bufPushFmt("STRING(\"{s}\")", .{c_lit});

Expand Down Expand Up @@ -1877,12 +1877,9 @@ fn retExprStmt(c: *Chunk, loc: usize, nodeId: cy.NodeId) !void {
// }
}

pub fn cStringLit(self: *Chunk, lit: []const u8) ![]const u8 {
// Big enough to hold unescaped lit and escape c lit.
try self.base.tempBufU8.resize(self.alloc, lit.len * 3);

// First unescape Cyber literal.
const unescaped = try cy.sema.unescapeString(self.base.tempBufU8.items[0..lit.len], lit, false);
pub fn cStringLit(self: *Chunk, raw: []const u8) ![]const u8 {
// Big enough to hold escaped C literal.
try self.base.tempBufU8.resize(self.alloc, raw.len * 2);

// Escape to C literal.
const ReplaceChars = "\\\"";
Expand All @@ -1895,19 +1892,19 @@ pub fn cStringLit(self: *Chunk, lit: []const u8) ![]const u8 {
};
}
};
if (std.mem.indexOfAny(u8, unescaped, ReplaceChars)) |idx| {
var fbuf = std.io.fixedBufferStream(self.base.tempBufU8.items[lit.len..]);
if (std.mem.indexOfAny(u8, raw, ReplaceChars)) |idx| {
var fbuf = std.io.fixedBufferStream(self.base.tempBufU8.items[0..]);
const w = fbuf.writer();
try w.print("{s}{s}", .{unescaped[0..idx], S.replacement(unescaped[idx])});
try w.print("{s}{s}", .{raw[0..idx], S.replacement(raw[idx])});

var rest = unescaped[idx+1..];
var rest = raw[idx+1..];
while (std.mem.indexOfAny(u8, rest, ReplaceChars)) |idx2| {
try w.print("{s}{s}", .{rest[0..idx2], S.replacement(rest[idx2])});
rest = rest[idx2+1..];
}
try w.writeAll(rest);
return fbuf.getWritten();
} else {
return unescaped;
return raw;
}
}
}
6 changes: 5 additions & 1 deletion src/cyon.zig
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,10 @@ pub const DecodeMapIR = struct {
pub fn allocString(self: DecodeMapIR, key: []const u8) ![]const u8 {
if (self.map.get(key)) |val_id| {
const val_n = self.ast.node(val_id);
if (val_n.type() == .stringLit) {
if (val_n.type() == .raw_string_lit) {
const token_s = self.ast.nodeString(val_n);
return try self.alloc.dupe(u8, token_s);
} else if (val_n.type() == .stringLit) {
const token_s = self.ast.nodeString(val_n);
var buf = std.ArrayList(u8).init(self.alloc);
defer buf.deinit();
Expand Down Expand Up @@ -587,6 +590,7 @@ pub const DecodeValueIR = struct {
switch (node.type()) {
.arrayLit => return .list,
.recordLit => return .map,
.raw_string_lit,
.stringLit => return .string,
.hexLit,
.binLit,
Expand Down
2 changes: 1 addition & 1 deletion src/ir.zig
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ pub const Array = struct {
};

pub const String = struct {
literal: []const u8,
raw: []const u8,
};

pub const StringTemplate = struct {
Expand Down
13 changes: 11 additions & 2 deletions src/parser.zig
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,10 @@ pub const Parser = struct {
self.advance();
return try self.pushSpanNode(.ident, start);
},
.raw_string => {
self.advance();
return try self.pushSpanNode(.raw_string_lit, start);
},
.string => {
self.advance();
return try self.pushSpanNode(.stringLit, start);
Expand Down Expand Up @@ -1429,7 +1433,7 @@ pub const Parser = struct {
return self.reportError("Expected import specifier.", &.{});
};
const spec_t = self.ast.nodeType(spec);
if (spec_t == .stringLit) {
if (spec_t == .raw_string_lit) {
try self.consumeNewLineOrEnd();
} else {
return self.reportError("Expected import specifier to be a string. {}", &.{fmt.v(spec_t)});
Expand Down Expand Up @@ -2929,6 +2933,10 @@ pub const Parser = struct {
self.advance();
break :b try self.pushSpanNode(.runeLit, start);
},
.raw_string => b: {
self.advance();
break :b try self.pushSpanNode(.raw_string_lit, start);
},
.string => b: {
self.advance();
break :b try self.pushSpanNode(.stringLit, start);
Expand Down Expand Up @@ -3082,6 +3090,7 @@ pub const Parser = struct {
.and_k,
.as_k,
.capture,
.raw_string,
.string,
.bin,
.oct,
Expand Down Expand Up @@ -3837,7 +3846,7 @@ fn isRecedingIndent(p: *Parser, prevIndent: u32, curIndent: u32, indent: u32) !b
fn isRecordKeyNodeType(node_t: cy.NodeType) bool {
switch (node_t) {
.ident,
.stringLit,
.raw_string_lit,
.decLit,
.binLit,
.octLit,
Expand Down
26 changes: 19 additions & 7 deletions src/sema.zig
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,7 @@ pub fn semaStmt(c: *cy.Chunk, nodeId: cy.NodeId) !cy.NodeId {
}

const arg = c.ast.node(expr.data.callExpr.argHead);
if (arg.type() != .stringLit) {
if (arg.type() != .stringLit and arg.type() != .raw_string_lit) {
return c.reportErrorFmt("genLabel expected string arg", &.{}, nodeId);
}

Expand Down Expand Up @@ -2290,7 +2290,7 @@ fn resolveLocalDeclNamePath(c: *cy.Chunk, nameId: cy.NodeId) !DeclNamePathResult
const name = c.ast.nodeString(last);

var end = last.srcPos + name.len;
if (last.type() == .stringLit) {
if (last.type() == .raw_string_lit) {
end += 1;
}
const namePath = c.ast.src[nameN.srcPos..end];
Expand Down Expand Up @@ -3820,6 +3820,7 @@ pub const ChunkExt = struct {
return try semaIdent(c, nodeId, true);
},
.stringLit => return c.semaString(c.ast.nodeString(node), nodeId),
.raw_string_lit => return c.semaRawString(c.ast.nodeString(node), nodeId),
.runeLit => {
const literal = c.ast.nodeString(node);
if (literal.len == 0) {
Expand Down Expand Up @@ -4043,7 +4044,7 @@ pub const ChunkExt = struct {
const name = c.ast.nodeString(key);
c.ir.setArrayItem(irKeysIdx, []const u8, i, name);
},
.stringLit => {
.raw_string_lit => {
const name = c.ast.nodeString(key);
c.ir.setArrayItem(irKeysIdx, []const u8, i, name);
},
Expand Down Expand Up @@ -4163,8 +4164,7 @@ pub const ChunkExt = struct {
if (child.type() == .ident) {
const name = c.ast.nodeString(child);
if (std.mem.eql(u8, name, "modUri")) {
const irIdx = try c.ir.pushExpr(.string, c.alloc, bt.String, nodeId, .{ .literal = c.srcUri });
return ExprResult.initStatic(irIdx, bt.String);
return c.semaRawString(c.srcUri, nodeId);
} else {
return c.reportErrorFmt("Compile-time symbol does not exist: {}", &.{v(name)}, node.data.comptimeExpr.child);
}
Expand Down Expand Up @@ -4328,8 +4328,20 @@ pub const ChunkExt = struct {
}
}

pub fn semaString(c: *cy.Chunk, str: []const u8, nodeId: cy.NodeId) !ExprResult {
const irIdx = try c.ir.pushExpr(.string, c.alloc, bt.String, nodeId, .{ .literal = str });
/// Analyzes an escaped string literal and emits a `.string` IR expression for its unescaped bytes.
///
/// `lit` is passed through `c.unescapeString`. When the returned slice does not start at
/// `lit.ptr`, its bytes are duplicated with `c.alloc` and the copy is appended to
/// `c.parser.ast.strs`, which tracks heap generated strings. Otherwise the returned slice
/// is used as is and nothing is allocated here.
///
/// Errors from `unescapeString`, the allocator, and `semaRawString` are propagated.
pub fn semaString(c: *cy.Chunk, lit: []const u8, nodeId: cy.NodeId) !ExprResult {
    const raw = try c.unescapeString(lit);
    if (raw.ptr == lit.ptr) {
        // The result still starts at the literal itself, so no copy is made.
        return c.semaRawString(raw, nodeId);
    }

    // NOTE(review): presumably `raw` lives in a reusable scratch buffer at this point,
    // which is why it is duplicated -- confirm against `unescapeString`.
    const dupe = try c.alloc.dupe(u8, raw);
    c.parser.ast.strs.append(c.alloc, dupe) catch |err| {
        // The copy is not tracked by `ast.strs` yet; free it so a failed append does not leak.
        c.alloc.free(dupe);
        return err;
    };

    // From here on the copy is tracked by `ast.strs`, so it is deliberately not freed in
    // this function even if the call below fails (presumably the owner of `ast.strs`
    // releases it -- confirm).
    return c.semaRawString(dupe, nodeId);
}

/// Emits a `.string` IR expression of type `bt.String` whose payload is `raw`, stored as given
/// (no unescaping is performed here), and wraps the new IR index with `ExprResult.initStatic`.
pub fn semaRawString(c: *cy.Chunk, raw: []const u8, nodeId: cy.NodeId) !ExprResult {
    return ExprResult.initStatic(
        try c.ir.pushExpr(.string, c.alloc, bt.String, nodeId, .{ .raw = raw }),
        bt.String,
    );
}

Expand Down
7 changes: 4 additions & 3 deletions src/tokenizer.zig
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ pub const TokenType = enum(u8) {
right_bracket,
right_paren,
rune,
raw_string,
string,
struct_k,
switch_k,
Expand Down Expand Up @@ -796,7 +797,7 @@ pub const Tokenizer = struct {
} else return t.reportErrorAt("UnterminatedString", &.{}, start);
}
if (peek(t) == '\'') {
try t.pushSpanToken(.string, start, t.nextPos);
try t.pushSpanToken(.raw_string, start, t.nextPos);
advance(t);
return;
} else if (peek(t) == '\n') {
Expand Down Expand Up @@ -826,7 +827,7 @@ pub const Tokenizer = struct {
continue;
};
if (ch == '\'' and ch2 == '\'') {
try t.pushSpanToken(.string, start, t.nextPos);
try t.pushSpanToken(.raw_string, start, t.nextPos);
advance(t);
advance(t);
advance(t);
Expand Down Expand Up @@ -1023,6 +1024,6 @@ test "tokenizer internals." {
try tt.eq(@alignOf(Token), 4);
try tt.eq(@sizeOf(TokenizeState), 4);

try tt.eq(std.enums.values(TokenType).len, 69);
try tt.eq(std.enums.values(TokenType).len, 70);
try tt.eq(keywords.kvs.len, 34);
}
6 changes: 6 additions & 0 deletions test/builtins/strings.cy
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ import t 'test'
var escaped = 'Return the underlying `symbol`.'.replace('`', '\\`')
t.eq(escaped, 'Return the underlying \\`symbol\\`.')

-- Single quotes do not interpret escape sequences: the literal below is two bytes,
-- a backslash followed by the letter n.
var a = '\n'
t.eq(a.len(), 2)
-- Assert each byte; a bare `==` expression statement would discard its result.
t.eq(a[0], `\\`)
t.eq(a[1], `n`)

-- Indexing an invalid utf8.
var invalid = "\xc3\x28"
t.eq(invalid.len(), 2)
Expand Down

0 comments on commit 2bddd40

Please sign in to comment.