forked from LadybirdBrowser/ladybird
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
LibWeb: Add start of HTML Tokenizer in Swift
Currently it's just a Token class.
- Loading branch information
Showing
5 changed files
with
169 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* | ||
* Copyright (c) 2024, Andrew Kaster <[email protected]> | ||
* | ||
* SPDX-License-Identifier: BSD-2-Clause | ||
*/ | ||
|
||
import AK | ||
import LibWeb | ||
import SwiftLibWeb | ||
import Foundation | ||
|
||
/// A `TextOutputStream` that forwards everything written to it to the
/// process's standard error, for use with `print(_:to:)`.
class StandardError: TextOutputStream {
    /// Writes `string`, encoded as UTF-8, to standard error.
    ///
    /// - Parameter string: The text to emit.
    func write(_ string: Swift.String) {
        // This stream only carries progress diagnostics. A failed write (for
        // example because stderr has been closed) must not abort the test
        // process, so the error is deliberately dropped instead of trapping.
        try? FileHandle.standardError.write(contentsOf: Data(string.utf8))
    }
}
|
||
/// Stand-alone test executable for the Swift `HTMLToken` type.
///
/// Every check reports its progress on standard error and traps through
/// `precondition` at the first expectation that does not hold.
@main
struct TestHTMLTokenizerSwift {

    /// Checks that a token whose type has been set to `.Character` reports
    /// itself as a character token.
    static func testTokenTypes() {
        var errorStream = StandardError()
        print("Testing HTMLToken types...", to: &errorStream)

        let token = HTMLToken()
        token.type = .Character(codePoint: "a")
        precondition(token.isCharacter())

        print("HTMLToken types pass", to: &errorStream)
    }

    /// Checks `isParserWhitespace()` against the five characters expected to
    /// count as whitespace and against a sample of letters that must not.
    static func testParserWhitespace() {
        var errorStream = StandardError()
        print("Testing HTMLToken parser whitespace...", to: &errorStream)

        let whitespace: [Character] = ["\t", "\n", "\r", "\u{000C}", " "]
        for character in whitespace {
            let whitespaceToken = HTMLToken(type: .Character(codePoint: character))
            precondition(whitespaceToken.isParserWhitespace())
        }

        // Iterating the string yields the same ten single-letter characters,
        // in the same order, as an explicit array literal would.
        for letter in "abcdefghij" {
            let letterToken = HTMLToken(type: .Character(codePoint: letter))
            precondition(!letterToken.isParserWhitespace())
        }

        print("HTMLToken parser whitespace pass", to: &errorStream)
    }

    /// Entry point: runs every test in order and reports overall success.
    static func main() {
        var errorStream = StandardError()
        print("Starting test suite...", to: &errorStream)

        testTokenTypes()
        testParserWhitespace()

        print("All tests pass", to: &errorStream)
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,4 +44,4 @@ struct TestLibWebSwiftBindings { | |
|
||
print("All tests pass", to: &standardError) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
/* | ||
* Copyright (c) 2024, Andrew Kaster <[email protected]> | ||
* | ||
* SPDX-License-Identifier: BSD-2-Clause | ||
*/ | ||
|
||
/// A single HTML token: its kind (with kind-specific payload) plus the start
/// and end positions recorded for it.
public class HTMLToken {
    /// A line / column / byte-offset triple.
    public struct Position {
        public var line: UInt
        public var column: UInt
        public var byteOffset: UInt

        /// Creates a position; every component defaults to zero.
        public init(line: UInt = 0, column: UInt = 0, byteOffset: UInt = 0) {
            self.line = line
            self.column = column
            self.byteOffset = byteOffset
        }
    }

    /// An attribute carried by a `StartTag` or `EndTag` token.
    public struct Attribute {
        public var prefix: String?
        public var localName: String
        public var namespace_: String?
        public var value: String
        public var nameStartPosition: Position
        public var nameEndPosition: Position
        public var valueStartPosition: Position
        public var valueEndPosition: Position

        /// Creates an attribute.
        ///
        /// The labels and their order match the memberwise initializer the
        /// compiler would otherwise synthesize, so existing call sites keep
        /// compiling; the positions additionally default to zeroed values.
        public init(
            prefix: String? = nil,
            localName: String,
            namespace_: String? = nil,
            value: String,
            nameStartPosition: Position = Position(),
            nameEndPosition: Position = Position(),
            valueStartPosition: Position = Position(),
            valueEndPosition: Position = Position()
        ) {
            self.prefix = prefix
            self.localName = localName
            self.namespace_ = namespace_
            self.value = value
            self.nameStartPosition = nameStartPosition
            self.nameEndPosition = nameEndPosition
            self.valueStartPosition = valueStartPosition
            self.valueEndPosition = valueEndPosition
        }
    }

    /// The kind of a token together with the data specific to that kind.
    public enum TokenType {
        case Invalid
        case DOCTYPE(
            name: String?,
            publicIdentifier: String?,
            systemIdentifier: String?,
            forceQuirksMode: Bool)
        case StartTag(
            tagName: String,
            selfClosing: Bool,
            selfClosingAcknowledged: Bool,
            attributes: [Attribute])
        case EndTag(
            tagName: String,
            selfClosing: Bool,
            selfClosingAcknowledged: Bool,
            attributes: [Attribute])
        case Comment(data: String)
        // NOTE(review): a Swift `Character` is a grapheme cluster, so it can
        // hold more than one Unicode scalar (e.g. "\r\n") — confirm that this
        // is acceptable for a payload named `codePoint`.
        case Character(codePoint: Character)
        case EndOfFile
    }

    /// The kind of this token; `.Invalid` until one is assigned.
    public var type = TokenType.Invalid
    /// Start position of the token; zeroed by default.
    public var startPosition = Position()
    /// End position of the token; zeroed by default.
    public var endPosition = Position()

    /// Creates an `.Invalid` token with zeroed positions.
    public init() {}

    /// Creates a token of the given kind with zeroed positions.
    public init(type: TokenType) {
        self.type = type
    }

    /// Returns `true` when this token is a `.Character` token.
    public func isCharacter() -> Bool {
        if case .Character = type {
            return true
        }
        return false
    }

    /// Returns `true` when this character token holds tab, line feed, form
    /// feed, carriage return or space.
    ///
    /// - Precondition: The token is a `.Character` token; calling this on any
    ///   other kind of token traps.
    public func isParserWhitespace() -> Bool {
        precondition(isCharacter(), "isParserWhitespace() called on non-character token")

        // The precondition above guarantees the Character case; the fallback
        // only satisfies the compiler.
        guard case .Character(let codePoint) = type else {
            return false
        }

        // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
        switch codePoint {
        case "\t", "\n", "\u{000C}", "\r", " ": // "\u{000C}" is form feed (\f)
            return true
        default:
            return false
        }
    }
}