Skip to content

Commit

Permalink
[BigString] Fix accidentally quadratic BigString.init
Browse files Browse the repository at this point in the history
When ingesting a `String` instance, `BigString` assumes that the input string has a reasonably efficient UTF-8 view.

Unfortunately, that is very much not the case when the input happens to be backed by a bridged NSString object — it appears that in this case, the ingester loop invokes some operation(s) with linear complexity in the size of the entire input, rendering the ingester’s overall complexity quadratic.

The BigString ingester is only expected to operate within a single chunk at a time. It’s unclear precisely which operation triggers the quadratic behavior; ideally we should figure it out and resolve it with a more targeted fix.

In the meantime, a blunt stopgap fix is to force-transcode the input string to UTF-8 at the time the ingester is initialized. This unnecessarily wastes some (temporary) memory on holding the transcoded string, but it avoids the quadratic cliff.
  • Loading branch information
lorentey committed Jul 15, 2024
1 parent 3d2dc41 commit 5e1fe6e
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 3 deletions.
108 changes: 108 additions & 0 deletions Benchmarks/Sources/Benchmarks/BigStringBenchmarks.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift Collections open source project
//
// Copyright (c) 2024 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import CollectionsBenchmark
import _RopeModule
import Foundation

/// Pool of Unicode scalars that the random string generators in this file
/// draw from. It is the concatenation, in order, of the half-open code point
/// ranges listed below.
let someLatinSymbols: [UnicodeScalar] = {
  let codePointRanges: [Range<UInt32>] = [
    0x20 ..< 0x7f,
    0xa1 ..< 0xad,
    0xae ..< 0x2af,
    0x300 ..< 0x370,
    0x1e00 ..< 0x1eff,
  ]
  var scalars: [UnicodeScalar] = []
  for range in codePointRanges {
    for codePoint in range {
      // None of the ranges above overlaps the surrogate area, so the
      // failable initializer cannot return nil here.
      scalars.append(UnicodeScalar(codePoint)!)
    }
  }
  return scalars
}()

extension UnicodeScalar {
  /// Returns one element of `someLatinSymbols`, chosen at random using `rng`.
  ///
  /// - Parameter rng: The random number generator to draw from.
  static func randomLatin(
    using rng: inout some RandomNumberGenerator
  ) -> Self {
    let pick = someLatinSymbols.randomElement(using: &rng)
    // `randomElement` only returns nil for an empty collection.
    return pick!
  }
}

extension String.UnicodeScalarView {
  /// Builds a scalar view by appending `runeCount` scalars, each obtained
  /// from `UnicodeScalar.randomLatin(using:)`.
  ///
  /// - Parameters:
  ///   - runeCount: The number of scalars to generate.
  ///   - rng: The random number generator to draw from.
  static func randomLatin(
    runeCount: Int, using rng: inout some RandomNumberGenerator
  ) -> Self {
    var scalars = Self()
    (0 ..< runeCount).forEach { _ in
      scalars.append(Unicode.Scalar.randomLatin(using: &rng))
    }
    return scalars
  }
}

extension String {
  /// Creates a string from a randomly generated scalar view of `runeCount`
  /// scalars (see `String.UnicodeScalarView.randomLatin`).
  ///
  /// - Parameters:
  ///   - runeCount: The number of Unicode scalars in the result.
  ///   - rng: The random number generator to draw from.
  static func randomLatin(
    runeCount: Int, using rng: inout some RandomNumberGenerator
  ) -> Self {
    Self(UnicodeScalarView.randomLatin(runeCount: runeCount, using: &rng))
  }
}

/// Benchmark input wrapping a random string produced directly by
/// `String.randomLatin`.
struct NativeStringInput {
  /// The generated string.
  let value: String

  /// - Parameters:
  ///   - runeCount: The number of Unicode scalars to generate.
  ///   - rng: The random number generator to draw from.
  init(runeCount: Int, using rng: inout some RandomNumberGenerator) {
    value = .randomLatin(runeCount: runeCount, using: &rng)
  }
}

/// Benchmark input wrapping a random string that was round-tripped through
/// an `NSString` built from UTF-16 code units and then bridged back to
/// `String`.
struct BridgedStringInput {
  /// The bridged string.
  let value: String

  /// - Parameters:
  ///   - runeCount: The number of Unicode scalars to generate.
  ///   - rng: The random number generator to draw from.
  init(runeCount: Int, using rng: inout some RandomNumberGenerator) {
    let string = String.randomLatin(runeCount: runeCount, using: &rng)
    let utf16 = Array(string.utf16)
    // Pass the array directly: the implicit array-to-pointer conversion
    // supplies a pointer for the duration of the call, which avoids
    // force-unwrapping `baseAddress` (an optional that is not guaranteed to
    // be non-nil when the buffer is empty, e.g. for `runeCount == 0`).
    let cocoa = NSString(characters: utf16, length: utf16.count)
    self.value = cocoa as String
  }
}


extension Benchmark {
  /// Registers the input generators and benchmark tasks that measure
  /// `BigString` initialization from a native and from a bridged string.
  ///
  /// Does nothing when the availability check below fails.
  public mutating func addBigStringBenchmarks() {
    guard #available(macOS 13.3, iOS 16.4, watchOS 9.4, tvOS 16.4, *) else {
      return
    }

    // Input generators: the size passed in is used as the scalar count.
    registerInputGenerator(for: NativeStringInput.self) { size in
      var generator = SystemRandomNumberGenerator()
      return NativeStringInput(runeCount: size, using: &generator)
    }

    registerInputGenerator(for: BridgedStringInput.self) { size in
      var generator = SystemRandomNumberGenerator()
      return BridgedStringInput(runeCount: size, using: &generator)
    }

    // Tasks: construct a BigString and hand it to `blackHole`.
    addSimple(
      title: "BigString init from native string",
      input: NativeStringInput.self
    ) { native in
      blackHole(BigString(native.value))
    }

    addSimple(
      title: "BigString init from bridged string",
      input: BridgedStringInput.self
    ) { bridged in
      blackHole(BigString(bridged.value))
    }
  }
}
1 change: 1 addition & 0 deletions Benchmarks/Sources/benchmark-tool/main.swift
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ benchmark.addHeapBenchmarks()
// Register each benchmark family on the shared `benchmark` instance.
benchmark.addBitSetBenchmarks()
benchmark.addTreeSetBenchmarks()
benchmark.addCppBenchmarks()
benchmark.addBigStringBenchmarks()
// Foundation benchmarks are only registered when building for the Apple
// platforms listed in the condition below.
#if os(macOS) || os(iOS) || os(watchOS) || os(tvOS)
benchmark.addFoundationBenchmarks()
#endif
Expand Down
12 changes: 9 additions & 3 deletions Sources/RopeModule/BigString/Basics/BigString+Ingester.swift
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,19 @@ extension BigString {
var state: _CharacterRecognizer

// Convenience initializer: ingests `input` with a freshly created
// `_CharacterRecognizer` as the starting state.
// NOTE(review): this rendering appears to interleave the diff's removed
// lines (the three direct stored-property assignments) with the added
// delegating `self.init` call; presumably only the `self.init` line remains
// after the commit -- confirm against the actual diff.
init(_ input: Substring) {
self.input = input
self.start = input.startIndex
self.state = _CharacterRecognizer()
self.init(input, startState: _CharacterRecognizer())
}

/// Creates an ingester over `input` whose recognizer state is initialized
/// to `startState`.
///
/// - Parameters:
///   - input: The text to ingest. It is stored after being converted to
///     contiguous UTF-8 storage.
///   - startState: The initial value for `state`.
init(_ input: Substring, startState: __owned _CharacterRecognizer) {
  self.input = input
  // Prevent accidentally quadratic operation by ensuring that we have
  // a native UTF-8 string.
  // FIXME: This is wasteful: if `input` happens to be a bridged
  // FIXME: NSString instance, then it temporarily allocates a full
  // FIXME: copy of the (transcoded) input string, only to then copy
  // FIXME: its pieces into the tree later.
  // FIXME: We should have a direct ingester path for native UTF-16 data.
  self.input.makeContiguousUTF8()
  // Read the start index from the stored substring, not from the `input`
  // parameter: `makeContiguousUTF8()` may have replaced the storage with a
  // transcoded copy, and an index taken from the original string is not
  // guaranteed to be a valid position in that copy.
  self.start = self.input.startIndex
  self.state = startState
}
Expand Down

0 comments on commit 5e1fe6e

Please sign in to comment.