Skip to content

Commit aa0de78

Browse files
committed
Account for match options during possessification
When deciding on exclusion during auto-possessification, awareness of the current matching options is important for correct analysis. For example, /a+A/ can be auto-possessified, but the case insensitive pattern /(?i)a+A/ cannot be.
1 parent bf9b2a7 commit aa0de78

File tree

5 files changed

+439
-287
lines changed

5 files changed

+439
-287
lines changed
Lines changed: 397 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,397 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
extension DSLList {
  /// A quantification that is waiting to learn which atom follows it.
  ///
  /// - `index`: position of the `.quantification` node in `nodes`.
  /// - `atom`: the atom every iteration of the quantified child must start
  ///   with.
  /// - `options`: the matching options that were in effect at the
  ///   quantification. They are carried along because the quantification may
  ///   surface out of a group whose option scope has already ended by the
  ///   time the following atom is found.
  private typealias PossessificationCandidate = (
    index: Int,
    atom: DSLTree.Atom,
    options: MatchingOptions
  )

  /// Finds the atom that the subtree rooted at `position` must begin with.
  ///
  /// The doubly-optional result distinguishes three outcomes:
  /// - `nil`: this subtree gives no answer (trivia, an empty node, or an
  ///   applied options change); the caller should consult the next sibling.
  /// - `.some(nil)`: definitive answer that no single required atom exists.
  /// - `.some(atom)`: `atom` is the required first atom.
  ///
  /// - Parameters:
  ///   - position: On entry, the index of the node to examine. On return it
  ///     is the index of the last node of that node's subtree on every path,
  ///     including early bail-outs, so callers that walk siblings by count
  ///     stay aligned. Left untouched when it is already out of bounds.
  ///   - options: The matching options tracked during the walk. Group scopes
  ///     are opened and closed here; option changes are applied only when
  ///     `allowOptionsChanges` is `true`.
  ///   - allowOptionsChanges: When `false`, meeting any options change is
  ///     reported as `.some(nil)`.
  private func _requiredAtomImpl(
    _ position: inout Int,
    options: inout MatchingOptions,
    allowOptionsChanges: Bool
  ) -> DSLTree.Atom?? {
    guard position < nodes.count else {
      return nil
    }

    // Several cases below answer without visiting their children. Compute the
    // end of this subtree up front and restore it on exit so that `position`
    // never ends up pointing into the middle of the subtree.
    // NOTE(review): relies on `skipNode` leaving its argument on the last
    // node of the skipped subtree, which is how the sibling-skipping loops in
    // this file use it — confirm against its definition.
    var subtreeEnd = position
    skipNode(&subtreeEnd)
    defer { position = subtreeEnd }

    switch nodes[position] {
    case .atom(let atom):
      switch atom {
      case .changeMatchingOptions(let seq):
        // TODO: Allow some/all options changes.
        if allowOptionsChanges {
          options.apply(seq.ast)
          return nil
        } else {
          return .some(nil)
        }
      default:
        return atom
      }

    // In a concatenation, the first child with a definitive answer decides.
    // The remaining children are skipped by the deferred reset above.
    case .concatenation(let children):
      for _ in 0..<children.count {
        position += 1
        if let answer = _requiredAtomImpl(
          &position, options: &options,
          allowOptionsChanges: allowOptionsChanges)
        {
          return .some(answer)
        }
      }
      return nil

    // For a quoted literal, we can look at the first char. An empty literal
    // yields `.some(nil)`, i.e. a conservative "no required atom".
    // TODO: matching semantics???
    case .quotedLiteral(let str):
      return str.first.map(DSLTree.Atom.char)

    // TODO: custom character classes could/should participate here somehow
    case .customCharacterClass:
      return .some(nil)

    // Trivia/empty have no effect.
    case .trivia, .empty:
      return nil

    // For alternation and conditional, no required first (this could change
    // if we identify the _same_ required first atom across all possibilities).
    case .orderedChoice, .conditional:
      return .some(nil)

    // A negative lookahead rules out the existence of a safe required
    // character.
    case .nonCapturingGroup(let kind, _) where kind.isNegativeLookahead:
      return .some(nil)

    // Other non-capturing groups open an option scope. A group that itself
    // changes options is only entered when changes are allowed.
    // TODO: Allow some/all options changes.
    case .nonCapturingGroup(let kind, _):
      position += 1
      options.beginScope()
      defer { options.endScope() }
      switch kind.ast {
      case .changeMatchingOptions(let seq) where allowOptionsChanges:
        options.apply(seq)
      case .changeMatchingOptions:
        return .some(nil)
      default:
        break
      }
      return _requiredAtomImpl(
        &position, options: &options,
        allowOptionsChanges: allowOptionsChanges)

    // Groups need to manage option scope.
    case .capture:
      position += 1
      options.beginScope()
      defer { options.endScope() }
      return _requiredAtomImpl(
        &position, options: &options,
        allowOptionsChanges: allowOptionsChanges)

    // Other parent nodes defer to the child.
    case .ignoreCapturesInTypedOutput,
         .limitCaptureNesting:
      position += 1
      return _requiredAtomImpl(
        &position, options: &options,
        allowOptionsChanges: allowOptionsChanges)

    // A quantification that may match zero times does not guarantee that its
    // child's first atom appears, so only "at least one" quantifications
    // defer to the child.
    case .quantification(let amount, _, _):
      if amount.requiresAtLeastOne {
        position += 1
        return _requiredAtomImpl(
          &position, options: &options,
          allowOptionsChanges: allowOptionsChanges)
      } else {
        return .some(nil)
      }

    // Extended behavior isn't known, so report "no required atom" for safety.
    case .consumer, .matcher, .characterPredicate, .absentFunction:
      return .some(nil)
    }
  }

  /// Returns the atom that every match of this list must begin with, or `nil`
  /// when there is none or it cannot be determined.
  ///
  /// - Parameter allowOptionsChanges: Whether the search may continue past
  ///   matching-option changes; when `false`, meeting one ends the search
  ///   with `nil`.
  internal func requiredFirstAtom(allowOptionsChanges: Bool) -> DSLTree.Atom? {
    var position = 0
    var options = MatchingOptions()
    let answer = _requiredAtomImpl(
      &position, options: &options,
      allowOptionsChanges: allowOptionsChanges)
    // Collapse "no answer" and "definitively none" into a single `nil`.
    return answer ?? nil
  }

  /// Walks the subtree rooted at `position`, converting quantifications to
  /// possessive where the atom that follows can never match what the
  /// quantified child starts with.
  ///
  /// - Parameters:
  ///   - position: On entry, the index of the node to process. On return it
  ///     is one past the last node of that node's subtree, regardless of the
  ///     result.
  ///   - options: The matching options tracked during the walk; scopes are
  ///     opened for groups and option changes are applied as encountered.
  /// - Returns: A quantification whose following atom has not been found yet
  ///   within this subtree, so that an enclosing concatenation can continue
  ///   the search among its own later children; otherwise `nil`.
  private mutating func _autoPossessifyImpl(
    _ position: inout Int,
    options: inout MatchingOptions
  ) -> PossessificationCandidate? {
    guard position < nodes.count else {
      return nil
    }

    switch nodes[position] {
    case .quantification:
      let candidateIndex = position
      // Remember the options that govern the quantified atom before anything
      // below can change them.
      let candidateOptions = options

      // Do a search within this quantification's contents. Anything that
      // surfaces from the inside is dropped.
      // FIXME: How to handle an inner quantification surfacing here?
      var innerPosition = position + 1
      _ = _autoPossessifyImpl(&innerPosition, options: &options)

      position += 1
      let firstAtom = _requiredAtomImpl(
        &position, options: &options, allowOptionsChanges: false)
      // `_requiredAtomImpl` stops on the child's last node; step past it so
      // that this case exits like every other case.
      position += 1

      if case .some(let atom?) = firstAtom {
        return (index: candidateIndex, atom: atom, options: candidateOptions)
      }
      return nil

    case .concatenation(let children):
      // If we find a valid quantification among this concatenation's
      // components, we must look for a required atom in the siblings that
      // follow it. Only if the siblings give no answer at all do we pop up
      // the recursion stack to find a sibling at a higher level.
      var remaining = children.count
      position += 1

      var candidate: PossessificationCandidate? = nil
      while remaining > 0 {
        remaining -= 1
        if let found = _autoPossessifyImpl(&position, options: &options) {
          candidate = found
          break
        }
      }

      // This loop only has children left to look at when a candidate was
      // found above.
      var following: DSLTree.Atom?? = nil
      while remaining > 0 {
        remaining -= 1
        let answer = _requiredAtomImpl(
          &position, options: &options, allowOptionsChanges: false)
        position += 1
        if let definitive = answer {
          following = .some(definitive)
          break
        }
      }

      // Step over whatever is left so `position` ends past the concatenation.
      for _ in 0..<remaining {
        skipNode(&position)
        position += 1
      }

      guard let pending = candidate else {
        return nil
      }

      switch following {
      case .none:
        // No sibling had anything to say; let an enclosing concatenation
        // continue the search.
        return pending

      case .some(.none):
        // A sibling definitively has no single required atom (alternation,
        // optional quantification, options change, ...). The quantification
        // must stay as it is and must not be paired with anything further
        // out.
        return nil

      case .some(let nextAtom?):
        // We found a quantifier with a required first atom and a required
        // following atom. If the second is excluded by the first, we can
        // convert the quantifier to possessive. The exclusion has to hold
        // under the options governing the quantified atom as well as those
        // governing the following atom: `/(?i:a+)A/` must not be converted
        // even though `a` excludes `A` once the group's scope has ended.
        //
        // NOTE(review): only the first atom of the quantified child is
        // compared. If the child can itself backtrack (e.g. `/(?:a.*)+b/`),
        // a possessive outer quantifier may also discard those inner
        // alternatives — confirm whether that needs to be ruled out here.
        if pending.atom.excludes(nextAtom, options: pending.options),
           pending.atom.excludes(nextAtom, options: options),
           case .quantification(let amount, _, let node) = nodes[pending.index]
        {
          nodes[pending.index] =
            .quantification(amount, .explicit(.possessive), node)
        }
        return nil
      }

    // For alternations, we need to explore / auto-possessify in the different
    // branches, but quantifications inside an alternation don't
    // auto-possessify with following matching elements outside of the
    // alternation (for now, at least).
    case .orderedChoice(let children):
      position += 1
      for _ in 0..<children.count {
        _ = _autoPossessifyImpl(&position, options: &options)
      }
      return nil

    // Same as alternations, just with n = 2
    case .conditional:
      position += 1
      for _ in 0..<2 {
        _ = _autoPossessifyImpl(&position, options: &options)
      }
      return nil

    case .nonCapturingGroup(let kind, _):
      position += 1
      options.beginScope()
      defer { options.endScope() }

      if case .changeMatchingOptions(let seq) = kind.ast {
        options.apply(seq)
      }
      return _autoPossessifyImpl(&position, options: &options)

    case .capture:
      position += 1
      options.beginScope()
      defer { options.endScope() }

      return _autoPossessifyImpl(&position, options: &options)

    case .atom(let atom):
      position += 1
      if case .changeMatchingOptions(let seq) = atom {
        options.apply(seq.ast)
      }
      return nil

    // All other nodes defer to the child, if present
    default:
      // Multi-child nodes are handled above, just handle 0 and 1 here.
      let childCount = nodes[position].directChildren
      position += 1

      assert(childCount <= 1)
      if childCount == 1 {
        return _autoPossessifyImpl(&position, options: &options)
      }
      return nil
    }
  }

  /// Processes the node at `position` (and everything beneath it) for
  /// auto-possessification.
  ///
  /// - Parameters:
  ///   - position: On entry, the index of the node to process. On return it
  ///     is one past the last node of that node's subtree.
  ///   - options: The matching options in effect at `position`; updated as
  ///     option changes are encountered.
  /// - Returns: The index and required first atom of a quantification that
  ///   was found but not yet paired with a following atom, or `nil`.
  internal mutating func autoPossessifyNextQuantification(
    _ position: inout Int,
    options: inout MatchingOptions
  ) -> (Int, DSLTree.Atom)? {
    guard let pending = _autoPossessifyImpl(&position, options: &options) else {
      return nil
    }
    return (pending.index, pending.atom)
  }

  /// Runs auto-possessification over the whole list, starting from default
  /// matching options.
  internal mutating func autoPossessify() {
    var index = 0
    var options = MatchingOptions()
    // Each call advances `index` past one complete subtree, so this loop
    // terminates. A quantification still pending at the top level has
    // nothing following it and is left untouched.
    while index < nodes.count {
      _ = autoPossessifyNextQuantification(&index, options: &options)
    }
  }
}
269+
270+
extension DSLTree.Atom {
  /// Returns whether this atom and `other` can be shown to never match at
  /// the same input position under `options`.
  ///
  /// The answer is conservative: `false` means "not known to be exclusive",
  /// and is returned for every combination that is not handled explicitly.
  ///
  /// - Parameters:
  ///   - other: The atom to compare against.
  ///   - options: The matching options used for the comparison (semantic
  ///     level and case insensitivity are read here; character classes are
  ///     turned into a runtime model with them).
  func excludes(_ other: Self, options: MatchingOptions) -> Bool {
    switch (self, other) {
    case (.char(let a), .char(let b)):
      // Two characters are mutually exclusive if one does not match against
      // the other.
      //
      // Relevant options:
      // - semantic level
      // - case insensitivity
      let s = String(a)

      if options.semanticLevel == .graphemeCluster {
        // Just call String.match(Character, ...)
        return nil == s.match(
          b, at: s.startIndex,
          limitedBy: s.endIndex,
          isCaseInsensitive: options.isCaseInsensitive)
      }

      // Call String.matchScalar(Scalar, ...) for each in scalar sequence.
      // Only a mismatching scalar proves exclusivity. If either character's
      // scalars run out first, it is a scalar prefix of the other, and with
      // no boundary check the shorter one can match where the longer one
      // starts, so the two are not exclusive.
      var i = s.startIndex
      var j = b.unicodeScalars.startIndex
      while i < s.endIndex, j < b.unicodeScalars.endIndex {
        guard let nextIndex = s.matchScalar(
          b.unicodeScalars[j], at: i,
          limitedBy: s.endIndex,
          boundaryCheck: false,
          isCaseInsensitive: options.isCaseInsensitive)
        else {
          return true
        }
        i = nextIndex
        b.unicodeScalars.formIndex(after: &j)
      }
      return false

    case (.scalar(let a), .scalar(let b)):
      // Two scalars are mutually exclusive if one does not match against
      // the other.
      //
      // Relevant options:
      // - case insensitivity
      let s = String(a)
      return nil == s.matchScalar(
        b, at: s.startIndex,
        limitedBy: s.endIndex,
        boundaryCheck: false,
        isCaseInsensitive: options.isCaseInsensitive)

    case (.characterClass(let a), .characterClass(let b)):
      // Certain character classes are mutually exclusive of each other.
      return a.excludes(b, options: options)

    // For character class and char/scalar, we can test against the class's
    // model: they are exclusive when the model does not match the
    // single-element string.
    case (.characterClass(let a), .char(let b)),
         (.char(let b), .characterClass(let a)):
      let s = "\(b)"
      return nil == a.asRuntimeModel(options)
        .matches(in: s, at: s.startIndex, limitedBy: s.endIndex)

    case (.characterClass(let a), .scalar(let b)),
         (.scalar(let b), .characterClass(let a)):
      let s = "\(b)"
      return nil == a.asRuntimeModel(options)
        .matches(in: s, at: s.startIndex, limitedBy: s.endIndex)

    // Anything else (mixed char/scalar, other atom kinds) is not analyzed.
    default:
      return false
    }
  }
}
334+
335+
extension DSLTree.Atom.CharacterClass {
  /// Returns whether this character class is listed as having no overlap
  /// with `other`.
  ///
  /// The relation is a fixed table and is conservative: any pairing that is
  /// not listed, and any pairing involving `.anyGrapheme` or
  /// `.anyUnicodeScalar`, yields `false`.
  ///
  /// NOTE(review): `options` is accepted but not consulted. If matching
  /// options can alter what individual classes match, some entries may need
  /// to depend on it — confirm.
  func excludes(_ other: Self, options: MatchingOptions) -> Bool {
    // The "any" classes overlap with everything.
    guard other != .anyGrapheme, other != .anyUnicodeScalar else {
      return false
    }

    // Classes recorded as disjoint from `self`. The switch has no default on
    // purpose so that adding a new class forces a decision here.
    let disjointClasses: [Self]
    switch self {
    case .anyGrapheme, .anyUnicodeScalar:
      disjointClasses = []

    case .digit:
      disjointClasses = [
        .whitespace, .horizontalWhitespace, .verticalWhitespace,
        .newlineSequence, .notWord, .notDigit,
      ]
    case .notDigit:
      disjointClasses = [.digit]

    case .horizontalWhitespace:
      disjointClasses = [
        .word, .digit, .verticalWhitespace, .newlineSequence,
        .notWhitespace, .notHorizontalWhitespace,
      ]
    case .notHorizontalWhitespace:
      disjointClasses = [.horizontalWhitespace]

    case .newlineSequence:
      disjointClasses = [.word, .digit, .horizontalWhitespace, .notNewline]
    case .notNewline:
      disjointClasses = [.newlineSequence]

    case .whitespace:
      disjointClasses = [.word, .digit, .notWhitespace]
    case .notWhitespace:
      disjointClasses = [.whitespace]

    case .verticalWhitespace:
      disjointClasses = [
        .word, .digit, .notWhitespace, .notVerticalWhitespace,
      ]
    case .notVerticalWhitespace:
      disjointClasses = [.verticalWhitespace]

    case .word:
      disjointClasses = [
        .whitespace, .horizontalWhitespace, .verticalWhitespace,
        .newlineSequence, .notWord,
      ]
    case .notWord:
      disjointClasses = [.word]
    }

    return disjointClasses.contains(other)
  }
}

0 commit comments

Comments
 (0)