-
Notifications
You must be signed in to change notification settings - Fork 207
/
Copy pathpcre2_substitute.c
1668 lines (1373 loc) · 49.6 KB
/
pcre2_substitute.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#include "pcre2_internal.h"
/* Number of entries in the ptrstack array that pcre2_substitute() declares for
each substitution. */
#define PTR_STACK_SIZE 20
/* All the option bits that belong to pcre2_substitute() itself.
pcre2_substitute() saves these bits in its own variable and clears them from the
options word before that word is passed on to pcre2_match(). */
#define SUBSTITUTE_OPTIONS \
(PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
/*************************************************
* Find end of substitute text *
*************************************************/
/* In extended mode, we recognize ${name:+set text:unset text} and similar
constructions. This requires the identification of unescaped : and }
characters. This function scans for such. It must deal with nested ${
constructions. The pointer to the text is updated, either to the required end
character, or to where an error was detected.
Arguments:
code points to the compiled expression (for options)
ptrptr points to the pointer to the start of the text (updated)
ptrend end of the whole string
last TRUE if the last expected string (only } recognized)
Returns: 0 on success
negative error code on failure
*/
static int
find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
  BOOL last)
{
PCRE2_SPTR cur = *ptrptr;
uint32_t depth = 0;        /* Count of nested ${ constructs still open */
BOOL in_literal = FALSE;   /* TRUE while between \Q and \E */
int result = 0;

for (; cur < ptrend; cur++)
  {
  int esc;
  int esc_error;
  uint32_t esc_char;

  /* Between \Q and \E nothing is special except the closing \E itself. */

  if (in_literal)
    {
    if (cur[0] == CHAR_BACKSLASH && ptrend - cur > 1 && cur[1] == CHAR_E)
      {
      in_literal = FALSE;
      cur++;
      }
    continue;
    }

  /* Deal with everything that is not a backslash escape. Only a backslash
  leaves this switch; every other character is finished with here. */

  switch (*cur)
    {
    case CHAR_RIGHT_CURLY_BRACKET:
    if (depth == 0) goto DONE;        /* The terminator we were looking for */
    depth--;
    continue;

    case CHAR_COLON:
    if (!last && depth == 0) goto DONE;  /* Separator before a further string */
    continue;

    case CHAR_DOLLAR_SIGN:
    if (ptrend - cur > 1 && cur[1] == CHAR_LEFT_CURLY_BRACKET)
      {
      depth++;
      cur++;
      }
    continue;

    case CHAR_BACKSLASH:
    break;

    default:
    continue;
    }

  /* The case-forcing escapes are stepped over without consulting
  check_escape(). */

  if (ptrend - cur > 1 &&
      (cur[1] == CHAR_L || cur[1] == CHAR_l ||
       cur[1] == CHAR_U || cur[1] == CHAR_u))
    {
    cur++;
    continue;
    }

  cur++;  /* check_escape() wants the pointer just past the backslash */
  esc = PRIV(check_escape)(&cur, ptrend, &esc_char, &esc_error,
    code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
  cur--;  /* Step back onto the last code unit of the escape */

  /* An error code from check_escape() is positive, so it cannot be handed back
  from pcre2_substitute(); report a generic bad-escape error instead. */

  if (esc_error != 0)
    {
    result = PCRE2_ERROR_BADREPESCAPE;
    goto DONE;
    }

  /* \Q begins a literal run. A data character (0), \b, \v, an isolated \E, a
  capture group reference (negative value) and \g are all accepted with no
  further action. For \g<name> no attempt is made to find the closing ">": this
  scan is deliberately lenient, and what matters is that a \g<...> or $<...>
  sequence cannot contain a '}' character. Any other escape is rejected. */

  if (esc == ESC_Q)
    {
    in_literal = TRUE;
    }
  else if (esc > 0 && esc != ESC_b && esc != ESC_v && esc != ESC_E &&
           esc != ESC_g)
    {
    result = PCRE2_ERROR_BADREPESCAPE;
    goto DONE;
    }
  }

/* The loop ran off the end of the string without meeting a terminator. */

result = PCRE2_ERROR_REPMISSINGBRACE;

DONE:
*ptrptr = cur;
return result;
}
/*************************************************
* Validate group name *
*************************************************/
/* This function scans for a capture group name, validating it
consists of legal characters, is not empty, and does not exceed
MAX_NAME_SIZE.
Arguments:
ptrptr points to the pointer to the start of the text (updated)
ptrend end of the whole string
utf true if the input is UTF-encoded
ctypes pointer to the character types table
Returns: TRUE if a name was read
FALSE otherwise
*/
static BOOL
read_name_subst(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf,
  const uint8_t* ctypes)
{
PCRE2_SPTR start = *ptrptr;
PCRE2_SPTR cur = start;
BOOL ok = FALSE;

/* With no input left there cannot be a name, so the scan is skipped and the
result stays FALSE. Names are only being referenced here, not defined, so there
is no check that the first character is a non-digit. The matching rules for
names inside a pattern are in read_name in pcre2_compile.c. */

if (cur < ptrend)
  {
#ifdef SUPPORT_UNICODE
  if (utf)
    {
    uint32_t c, type;

    /* UTF mode: accept decimal digits, letters and underscore, one whole
    character at a time. */

    while (cur < ptrend)
      {
      GETCHAR(c, cur);
      type = UCD_CHARTYPE(c);
      if (!(type == ucp_Nd || PRIV(ucp_gentype)[type] == ucp_L ||
            c == CHAR_UNDERSCORE))
        break;
      cur++;
      FORWARDCHARTEST(cur, ptrend);
      }
    }
  else
#else
  (void)utf;  /* Avoid compiler warning */
#endif  /* SUPPORT_UNICODE */

    /* Non-UTF mode: accept code units that are below 256 and flagged as word
    characters in the supplied type table. */

    {
    for (; cur < ptrend; cur++)
      {
      if (!MAX_255(*cur)) break;
      if ((ctypes[*cur] & ctype_word) == 0) break;
      }
    }

  /* The name must be non-empty and no longer than MAX_NAME_SIZE. */

  ok = (cur != start && cur - start <= MAX_NAME_SIZE)? TRUE : FALSE;
  }

/* The caller's pointer is moved to where scanning stopped, on failure as well
as on success. */

*ptrptr = cur;
return ok;
}
/*************************************************
* Case transformations *
*************************************************/
/* Case-forcing modes used in this file, in addition to the three referred to in
the comment below. NONE is the value a case_state starts with and returns to
once a single-character transformation has been done. REVERSE_TITLE_FIRST means
"first character to lowercase, the rest to uppercase"; do_case_copy() breaks it
down into LOWER and UPPER before calling a user callout. */
#define PCRE2_SUBSTITUTE_CASE_NONE 0
// 1, 2, 3 are PCRE2_SUBSTITUTE_CASE_LOWER, UPPER, TITLE_FIRST.
#define PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST 4
/* The case-forcing state handed to the case transformation helpers, which
update to_case to reflect what is still in force after they have run. */
typedef struct {
int to_case; /* One of PCRE2_SUBSTITUTE_CASE_xyz */
BOOL single_char; /* TRUE if only the first character is transformed */
} case_state;
/* Guess how much a string might grow when it is case-transformed. Most strings
keep their size, but a few rare characters expand. The guess is one eighth of
the original length (12.5%) plus a fixed allowance of 10.

Having to guess is unfortunate but unavoidable: the callout cannot be invoked
once the buffer has run out, because there is then nowhere to assemble its
input.

The guess is inexact, and in pathological cases it is too small. It must
therefore be documented that with a substitute_case_callout in combination with
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, more than two calls may be needed to discover
the final buffer size.

Argument:   len    length of the text awaiting case transformation
Returns:    the estimated growth, to be added to the space requested
*/
static PCRE2_SIZE
pessimistic_case_inflation(PCRE2_SIZE len)
{
PCRE2_SIZE proportional = len / 8;
return proportional + 10;
}
/* Case transformation behaviour if no callout is passed.

The text in input is case-transformed according to state and stored in output
for as long as it fits. Once a character does not fit, nothing more is stored,
but the length continues to be counted.

Arguments:
  input        the text to transform
  input_len    its length
  output       where to store the result; must not overlap input (asserted)
  output_cap   space available at output
  state        the case forcing to apply; to_case is updated to whatever stays
                 in force after this text (LOWER after TITLE_FIRST, UPPER after
                 REVERSE_TITLE_FIRST, NONE after a single-character transform)
  code         the compiled pattern, for its options and character tables

Returns:       the length of the transformed text, whether or not it fitted
               ~(PCRE2_SIZE)0 if that length overflows PCRE2_SIZE
               0 if input_len is zero or to_case has an unexpected value
*/
static PCRE2_SIZE
default_substitute_case_callout(
PCRE2_SPTR input, PCRE2_SIZE input_len,
PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
case_state *state, const pcre2_code *code)
{
PCRE2_SPTR input_end = input + input_len;
#ifdef SUPPORT_UNICODE
BOOL utf;
BOOL ucp;
#endif
PCRE2_UCHAR temp[6];
BOOL next_to_upper;
BOOL rest_to_upper;
BOOL single_char;
BOOL overflow = FALSE;
PCRE2_SIZE written = 0;
/* Helpful simplifying invariant: input and output are disjoint buffers.
I believe that this code is technically undefined behaviour, because the two
pointers input/output are "unrelated" pointers and hence not comparable. Casting
via char* bypasses some but not all of those technical rules. It is not included
in release builds, in any case. */
PCRE2_ASSERT((char *)(input + input_len) <= (char *)output ||
(char *)(output + output_cap) <= (char *)input);
#ifdef SUPPORT_UNICODE
utf = (code->overall_options & PCRE2_UTF) != 0;
ucp = (code->overall_options & PCRE2_UCP) != 0;
#endif
/* Nothing to do for empty input; in particular, state is left untouched. */
if (input_len == 0) return 0;
/* Work out the target case for the first character (next_to_upper) and for
all the following ones (rest_to_upper), and record in state what remains in
force after the first character. */
switch (state->to_case)
{
default:
PCRE2_DEBUG_UNREACHABLE();
return 0;
case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
next_to_upper = rest_to_upper = (state->to_case == PCRE2_SUBSTITUTE_CASE_UPPER);
break;
case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
next_to_upper = TRUE;
rest_to_upper = FALSE;
state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
break;
case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
next_to_upper = FALSE;
rest_to_upper = TRUE;
state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
break;
}
/* A single-character transformation uses up the case forcing entirely. */
single_char = state->single_char;
if (single_char)
state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
while (input < input_end)
{
uint32_t ch;
unsigned int chlen;
GETCHARINCTEST(ch, input);
/* With UTF or UCP set, characters from 128 upwards are handled using the
Unicode data: a letter that is not already of the wanted case type is replaced
by its other case. */
#ifdef SUPPORT_UNICODE
if ((utf || ucp) && ch >= 128)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&
type != (next_to_upper? ucp_Lu : ucp_Ll))
ch = UCD_OTHERCASE(ch);
/* TODO This is far from correct... it doesn't support the SpecialCasing.txt
mappings, but worse, it's not even correct for all the ordinary case
mappings. We should add support for those (at least), and then add the
SpecialCasing.txt mappings for Esszet and ligatures, and finally use the
Turkish casing flag on the match context. */
}
else
#endif
/* Otherwise, characters below 256 are handled using the pattern's character
tables: when the character's bit is not set in the class bitmap for the wanted
case, it is replaced by its entry in the table at fcc_offset. Any other
character is left alone. */
if (MAX_255(ch))
{
if (((code->tables + cbits_offset +
(next_to_upper? cbit_upper:cbit_lower)
)[ch/8] & (1u << (ch%8))) == 0)
ch = (code->tables + fcc_offset)[ch];
}
/* Encode the resulting character into temp, giving chlen code units. */
#ifdef SUPPORT_UNICODE
if (utf) chlen = PRIV(ord2utf)(ch, temp); else
#endif
{
temp[0] = ch;
chlen = 1;
}
/* Store the character if there is room. The overflow flag is sticky: once one
character has failed to fit, no later character is stored either. */
if (!overflow && chlen <= output_cap)
{
memcpy(output, temp, CU2BYTES(chlen));
output += chlen;
output_cap -= chlen;
}
else
{
overflow = TRUE;
}
if (chlen > ~(PCRE2_SIZE)0 - written) /* Integer overflow */
return ~(PCRE2_SIZE)0;
written += chlen;
/* Every character after the first uses the "rest" setting. */
next_to_upper = rest_to_upper;
/* memcpy the remainder, if only transforming a single character. */
if (single_char)
{
PCRE2_SIZE rest_len = input_end - input;
if (!overflow && rest_len <= output_cap)
memcpy(output, input, CU2BYTES(rest_len));
if (rest_len > ~(PCRE2_SIZE)0 - written) /* Integer overflow */
return ~(PCRE2_SIZE)0;
written += rest_len;
return written;
}
}
return written;
}
/* Helper to perform the call to the substitute_case_callout. We wrap the
user-provided callout because our internal arguments are slightly extended. We
don't want the user callout to handle the case of "\l" (first character only to
lowercase) or "\l\U" (first character to lowercase, rest to uppercase) because
those are not operations defined by Unicode. Instead the user callout simply
needs to provide the three Unicode primitives: lower, upper, titlecase.

The transformation is done in place: the same buffer supplies the input and
receives the output.

Arguments:
  input_output   the text to transform, replaced by the transformed text
  input_len      length of the text; must not be zero (asserted)
  output_cap     total space available at input_output; asserted to be at
                   least input_len when the first character is split off
  state          the case forcing to apply; to_case is updated to whatever
                   stays in force after this text
  utf            not referenced by name in this function; presumably consulted
                   by GETCHARINCTEST in builds where it is not voided below
  substitute_case_callout        the user's callout function
  substitute_case_callout_data   passed through to the callout unchanged

Returns:         the length needed for the transformed text; this is greater
                   than output_cap if any part of it did not fit
                 ~(PCRE2_SIZE)0 if the callout returned that value, or if the
                   total length overflows PCRE2_SIZE
                 0 if to_case has an unexpected value
*/
static PCRE2_SIZE
do_case_copy(
PCRE2_UCHAR *input_output, PCRE2_SIZE input_len, PCRE2_SIZE output_cap,
case_state *state, BOOL utf,
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
PCRE2_SIZE, int, void *),
void *substitute_case_callout_data)
{
PCRE2_SPTR input = input_output;
PCRE2_UCHAR *output = input_output;
PCRE2_SIZE rc;
PCRE2_SIZE rc2;
int ch1_to_case;
int rest_to_case;
PCRE2_UCHAR ch1[6];
PCRE2_SIZE ch1_len;
PCRE2_SPTR rest;
PCRE2_SIZE rest_len;
BOOL ch1_overflow = FALSE;
BOOL rest_overflow = FALSE;
#if PCRE2_CODE_UNIT_WIDTH == 32 || !defined(SUPPORT_UNICODE)
(void)utf; /* Avoid compiler warning. */
#endif
PCRE2_ASSERT(input_len != 0);
/* Either pass the whole text to the callout in one call, or decide which
operation applies to the first character (ch1_to_case) and which to everything
after it (rest_to_case). */
switch (state->to_case)
{
default:
PCRE2_DEBUG_UNREACHABLE();
return 0;
case PCRE2_SUBSTITUTE_CASE_LOWER: // Can be single_char TRUE or FALSE
case PCRE2_SUBSTITUTE_CASE_UPPER: // Can only be single_char FALSE
case PCRE2_SUBSTITUTE_CASE_TITLE_FIRST: // Can be single_char TRUE or FALSE
/* The easy case, where our internal casing operations align with those of
the callout. */
if (state->single_char == FALSE)
{
rc = substitute_case_callout(input, input_len, output, output_cap,
state->to_case, substitute_case_callout_data);
if (state->to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)
state->to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
return rc;
}
ch1_to_case = state->to_case;
rest_to_case = PCRE2_SUBSTITUTE_CASE_NONE;
break;
case PCRE2_SUBSTITUTE_CASE_REVERSE_TITLE_FIRST: // Can only be single_char FALSE
ch1_to_case = PCRE2_SUBSTITUTE_CASE_LOWER;
rest_to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
break;
}
/* Identify the leading character. Take copy, because its storage overlaps with
`output`, and hence may be scrambled by the callout. */
{
PCRE2_SPTR ch_end = input;
uint32_t ch;
GETCHARINCTEST(ch, ch_end);
(void) ch;
PCRE2_ASSERT(ch_end <= input + input_len && ch_end - input <= 6);
ch1_len = ch_end - input;
memcpy(ch1, input, CU2BYTES(ch1_len));
}
rest = input + ch1_len;
rest_len = input_len - ch1_len;
/* Transform just ch1. The buffers are always in-place (input == output). With a
custom callout, we need a loop to discover its required buffer size. The loop
wouldn't be required if the callout were well-behaved, but it might be naughty
and return "5" the first time, then "10" the next time we call it using the
exact same input! */
{
PCRE2_SIZE ch1_cap;
PCRE2_SIZE max_ch1_cap;
ch1_cap = ch1_len; /* First attempt uses the space vacated by ch1. */
PCRE2_ASSERT(output_cap >= input_len && input_len >= rest_len);
/* The transformed ch1 may grow only as far as still leaves room for the
untransformed rest of the text after it. */
max_ch1_cap = output_cap - rest_len;
while (TRUE)
{
rc = substitute_case_callout(ch1, ch1_len, output, ch1_cap, ch1_to_case,
substitute_case_callout_data);
if (rc == ~(PCRE2_SIZE)0) return rc;
if (rc <= ch1_cap) break;
if (rc > max_ch1_cap)
{
ch1_overflow = TRUE;
break;
}
/* Move the rest to the right, to make room for expanding ch1. */
memmove(input_output + rc, rest, CU2BYTES(rest_len));
rest = input + rc;
ch1_cap = rc;
/* Proof of loop termination: `ch1_cap` is growing on each iteration, but
the loop ends if `rc` reaches the (unchanging) upper bound of output_cap. */
}
}
/* Now deal with the rest of the text. When no transformation applies to it, it
only has to be moved so that it directly follows the transformed ch1. */
if (rest_to_case == PCRE2_SUBSTITUTE_CASE_NONE)
{
if (!ch1_overflow)
{
PCRE2_ASSERT(rest_len <= output_cap - rc);
memmove(output + rc, rest, CU2BYTES(rest_len));
}
rc2 = rest_len;
state->to_case = PCRE2_SUBSTITUTE_CASE_NONE;
}
else
{
/* If ch1 overflowed, the callout is still called to obtain the length of the
transformed rest, but it is given a dummy buffer with zero capacity. */
PCRE2_UCHAR dummy[1];
rc2 = substitute_case_callout(rest, rest_len,
ch1_overflow? dummy : output + rc,
ch1_overflow? 0u : output_cap - rc,
rest_to_case, substitute_case_callout_data);
if (rc2 == ~(PCRE2_SIZE)0) return rc2;
if (!ch1_overflow && rc2 > output_cap - rc) rest_overflow = TRUE;
/* If ch1 grows so that `xform(ch1)+rest` can't fit in the buffer, but then
`rest` shrinks, it's actually possible for the total calculated length of
`xform(ch1)+xform(rest)` to come out at less than output_cap. But we can't
report that, because it would make it seem that the operation succeeded.
If either of xform(ch1) or xform(rest) won't fit in the buffer, our final
result must be > output_cap. */
if (ch1_overflow && rc2 < rest_len)
rc2 = rest_len;
state->to_case = PCRE2_SUBSTITUTE_CASE_UPPER;
}
if (rc2 > ~(PCRE2_SIZE)0 - rc) /* Integer overflow */
return ~(PCRE2_SIZE)0;
PCRE2_ASSERT(!(ch1_overflow || rest_overflow) || rc + rc2 > output_cap);
(void)rest_overflow;
return rc + rc2;
}
/*************************************************
* Match and substitute *
*************************************************/
/* This function applies a compiled re to a subject string and creates a new
string with substitutions. The first 7 arguments are the same as for
pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
Arguments:
code points to the compiled expression
subject points to the subject string
length length of subject string (may contain binary zeros)
start_offset where to start in the subject string
options option bits
match_data points to a match_data block, or is NULL
mcontext points to a PCRE2 match context, or is NULL
replacement points to the replacement string
rlength length of replacement string
buffer where to put the substituted string
blength points to length of buffer; updated to length of string
Returns: >= 0 number of substitutions made
< 0 an error code
PCRE2_ERROR_BADREPLACEMENT means invalid use of $
*/
/* This macro checks for space in the buffer before copying into it. On
overflow, either give an error immediately, or keep on, accumulating the
length.

The macro uses names that must exist where it is expanded: the variables buffer,
buff_offset, lengthleft, overflowed, extra_needed and suboptions, and the labels
NOROOM and TOOLARGEREPLACE. There are three cases:

  already overflowed   nothing is copied; the length is added to extra_needed,
                         with a jump to TOOLARGEREPLACE if the sum would wrap
  does not fit         without PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, jump to NOROOM;
                         otherwise set overflowed and start extra_needed off
                         with the shortfall
  fits                 copy the data, advance buff_offset, reduce lengthleft

The length_ argument is evaluated once only. */
#define CHECKMEMCPY(from, length_) \
do { \
PCRE2_SIZE chkmc_length = length_; \
if (overflowed) \
{ \
if (chkmc_length > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
goto TOOLARGEREPLACE; \
extra_needed += chkmc_length; \
} \
else if (lengthleft < chkmc_length) \
{ \
if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
overflowed = TRUE; \
extra_needed = chkmc_length - lengthleft; \
} \
else \
{ \
memcpy(buffer + buff_offset, from, CU2BYTES(chkmc_length)); \
buff_offset += chkmc_length; \
lengthleft -= chkmc_length; \
} \
} \
while (0)
/* This macro checks for space and copies characters with casing modifications.
On overflow, it behaves as for CHECKMEMCPY().
When substitute_case_callout is NULL, the source and destination buffers must
not overlap, because our default handler does not support this.

CHECKCASECPY_BASE is the shared skeleton. The do_call argument is a block of
code that performs the transformation, writing directly into the buffer, and
leaves the resulting length in chkcc_rc; it can read the input length from
chkcc_length. It is expanded inside the do/while, so a "break" within it skips
the accounting that follows. That accounting either advances buff_offset and
reduces lengthleft (the result fitted), or jumps to NOROOM, or sets overflowed
and starts extra_needed off with the shortfall. It does not cater for overflowed
being already set on entry. */
#define CHECKCASECPY_BASE(length_, do_call) \
do { \
PCRE2_SIZE chkcc_length = (PCRE2_SIZE)(length_); \
PCRE2_SIZE chkcc_rc; \
do_call \
if (lengthleft < chkcc_rc) \
{ \
if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
overflowed = TRUE; \
extra_needed = chkcc_rc - lengthleft; \
} \
else \
{ \
buff_offset += chkcc_rc; \
lengthleft -= chkcc_rc; \
} \
} \
while (0)
/* Case-transforming copy using default_substitute_case_callout(), for use when
there is no user callout. The text at "from" is transformed into the buffer at
buff_offset, applying and updating forcecase. If the buffer has already
overflowed, the handler is given zero capacity so that it only measures; the
measured length is added to extra_needed (with a jump to TOOLARGEREPLACE if that
would wrap) and the "break" bypasses the rest of CHECKCASECPY_BASE. */
#define CHECKCASECPY_DEFAULT(from, length_) \
CHECKCASECPY_BASE(length_, { \
chkcc_rc = default_substitute_case_callout(from, chkcc_length, \
buffer + buff_offset, \
overflowed? 0 : lengthleft, \
&forcecase, code); \
if (overflowed) \
{ \
if (chkcc_rc > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
goto TOOLARGEREPLACE; \
extra_needed += chkcc_rc; \
break; \
} \
})
/* Case-transforming copy using the user's callout, by way of do_case_copy().
The transformation is done in place on the length_ code units that are already
in the buffer at buff_offset, applying and updating forcecase. If do_case_copy()
returns ~(PCRE2_SIZE)0, control goes to CASEERROR. This macro has no handling
for the buffer having already overflowed. */
#define CHECKCASECPY_CALLOUT(length_) \
CHECKCASECPY_BASE(length_, { \
chkcc_rc = do_case_copy(buffer + buff_offset, chkcc_length, \
lengthleft, &forcecase, utf, \
substitute_case_callout, \
substitute_case_callout_data); \
if (chkcc_rc == ~(PCRE2_SIZE)0) goto CASEERROR; \
})
/* This macro does a delayed case transformation, for the situation when we have
a case-forcing callout.

The text that is still waiting to be transformed is whatever has been added
since the position recorded in casestart_offset and casestart_extra_needed: the
part that is in the buffer plus the part that was only counted in extra_needed.
If there is none, nothing happens. If the buffer has overflowed, the callout
cannot be run, so extra_needed is increased by the guess that
pessimistic_case_inflation() makes (with a jump to TOOLARGEREPLACE if that would
wrap). Otherwise the buffer is rewound to casestart_offset and the outstanding
text is transformed in place by CHECKCASECPY_CALLOUT(). */
#define DELAYEDFORCECASE() \
do { \
PCRE2_SIZE chars_outstanding = (buff_offset - casestart_offset) + \
(extra_needed - casestart_extra_needed); \
if (chars_outstanding > 0) \
{ \
if (overflowed) \
{ \
PCRE2_SIZE guess = pessimistic_case_inflation(chars_outstanding); \
if (guess > ~(PCRE2_SIZE)0 - extra_needed) /* Integer overflow */ \
goto TOOLARGEREPLACE; \
extra_needed += guess; \
} \
else \
{ \
/* Rewind the buffer */ \
lengthleft += (buff_offset - casestart_offset); \
buff_offset = casestart_offset; \
/* Care! In-place case transformation */ \
CHECKCASECPY_CALLOUT(chars_outstanding); \
} \
} \
} \
while (0)
/* Here's the function */
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
{
int rc;
int subs;
uint32_t ovector_count;
uint32_t goptions = 0;
uint32_t suboptions;
pcre2_match_data *internal_match_data = NULL;
BOOL escaped_literal = FALSE;
BOOL overflowed = FALSE;
BOOL use_existing_match;
BOOL replacement_only;
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
PCRE2_UCHAR temp[6];
PCRE2_UCHAR null_str[1] = { 0xcd };
PCRE2_SPTR ptr;
PCRE2_SPTR repend = NULL;
PCRE2_SIZE extra_needed = 0;
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
PCRE2_SIZE *ovector;
PCRE2_SIZE ovecsave[2] = { 0, 0 };
pcre2_substitute_callout_block scb;
PCRE2_SIZE sub_start_extra_needed;
PCRE2_SIZE (*substitute_case_callout)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *,
PCRE2_SIZE, int, void *) = NULL;
void *substitute_case_callout_data = NULL;
/* General initialization */
buff_offset = 0;
lengthleft = buff_length = *blength;
*blength = PCRE2_UNSET;
if (mcontext != NULL)
{
substitute_case_callout = mcontext->substitute_case_callout;
substitute_case_callout_data = mcontext->substitute_case_callout_data;
}
/* Partial matching is not valid. This must come after setting *blength to
PCRE2_UNSET, so as not to imply an offset in the replacement. */
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
return PCRE2_ERROR_BADOPTION;
/* Validate length and find the end of the replacement. A NULL replacement of
zero length is interpreted as an empty string. */
if (replacement == NULL)
{
if (rlength != 0) return PCRE2_ERROR_NULL;
replacement = null_str;
}
if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
repend = replacement + rlength;
/* Check for using a match that has already happened. Note that the subject
pointer in the match data may be NULL after a no-match. */
use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
/* If starting from an existing match, there must be an externally provided
match data block. We create an internal match_data block in two cases: (a) an
external one is not supplied (and we are not starting from an existing match);
(b) an existing match is to be used for the first substitution. In the latter
case, we copy the existing match into the internal block, except for any cached
heap frame size and pointer. This ensures that no changes are made to the
external match data block. */
/* WARNING: In both cases below a general context is constructed "by hand"
because calling pcre2_general_context_create() involves a memory allocation. If
the contents of a general context control block are ever changed there will
have to be changes below. */
if (match_data == NULL)
{
pcre2_general_context gcontext;
if (use_existing_match) return PCRE2_ERROR_NULL;
gcontext.memctl = (mcontext == NULL)?
((pcre2_real_code *)code)->memctl :
((pcre2_real_match_context *)mcontext)->memctl;
match_data = internal_match_data =
pcre2_match_data_create_from_pattern(code, &gcontext);
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
}
else if (use_existing_match)
{
int pairs;
pcre2_general_context gcontext;
gcontext.memctl = (mcontext == NULL)?
((pcre2_real_code *)code)->memctl :
((pcre2_real_match_context *)mcontext)->memctl;
pairs = (code->top_bracket + 1 < match_data->oveccount)?
code->top_bracket + 1 : match_data->oveccount;
internal_match_data = pcre2_match_data_create(match_data->oveccount,
&gcontext);
if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
+ 2*pairs*sizeof(PCRE2_SIZE));
internal_match_data->heapframes = NULL;
internal_match_data->heapframes_size = 0;
match_data = internal_match_data;
}
/* Remember ovector details */
ovector = pcre2_get_ovector_pointer(match_data);
ovector_count = pcre2_get_ovector_count(match_data);
/* Fixed things in the callout block */
scb.version = 0;
scb.input = subject;
scb.output = (PCRE2_SPTR)buffer;
scb.ovector = ovector;
/* A NULL subject of zero length is treated as an empty string. */
if (subject == NULL)
{
if (length != 0) return PCRE2_ERROR_NULL;
subject = null_str;
}
/* Find length of zero-terminated subject */
if (length == PCRE2_ZERO_TERMINATED)
length = subject? PRIV(strlen)(subject) : 0;
/* Check UTF replacement string if necessary. */
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
if (rc != 0)
{
match_data->leftchar = 0;
goto EXIT;
}
}
#endif /* SUPPORT_UNICODE */
/* Save the substitute options and remove them from the match options. */
suboptions = options & SUBSTITUTE_OPTIONS;
options &= ~SUBSTITUTE_OPTIONS;
/* Error if the start match offset is greater than the length of the subject. */
if (start_offset > length)
{
match_data->leftchar = 0;
rc = PCRE2_ERROR_BADOFFSET;
goto EXIT;
}
/* Copy up to the start offset, unless only the replacement is required. */
if (!replacement_only) CHECKMEMCPY(subject, start_offset);
/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
match is taken from the match_data that was passed in. */
subs = 0;
for (;;)
{
PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
uint32_t ptrstackptr = 0;
case_state forcecase = { PCRE2_SUBSTITUTE_CASE_NONE, FALSE };
PCRE2_SIZE casestart_offset = 0;
PCRE2_SIZE casestart_extra_needed = 0;
if (use_existing_match)
{
rc = match_data->rc;
use_existing_match = FALSE;
}
else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
match_data, mcontext);
#ifdef SUPPORT_UNICODE
if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
#endif
/* Any error other than no match returns the error code. No match breaks the
global loop. */
if (rc == PCRE2_ERROR_NOMATCH) break;
if (rc < 0) goto EXIT;
/* Handle a successful match. Matches that use \K to end before they start
or start before the current point in the subject are not supported. */
if (ovector[1] < ovector[0] || ovector[0] < start_offset)
{
rc = PCRE2_ERROR_BADSUBSPATTERN;
goto EXIT;
}
/* Assert that our replacement loop is making progress, checked even in
release builds. This should be impossible to hit, however, an infinite loop
would be fairly catastrophic.
"Progress" is measured as ovector[1] strictly advancing, or, an empty match
after a non-empty match. */
if (subs > 0 &&
!(ovector[1] > ovecsave[1] ||
(ovector[1] == ovector[0] && ovecsave[1] > ovecsave[0] &&
ovector[1] == ovecsave[1])))
{
rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
goto EXIT;
}
ovecsave[0] = ovector[0];
ovecsave[1] = ovector[1];
/* Count substitutions with a paranoid check for integer overflow; surely no
real call to this function would ever hit this! */
if (subs == INT_MAX)
{
rc = PCRE2_ERROR_TOOMANYREPLACE;
goto EXIT;
}
subs++;
/* Copy the text leading up to the match (unless not required); remember
where the insert begins and how many ovector pairs are set; and remember how
much space we have requested in extra_needed. */
if (rc == 0) rc = ovector_count;
fraglength = ovector[0] - start_offset;
if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
scb.output_offsets[0] = buff_offset;
scb.oveccount = rc;
sub_start_extra_needed = extra_needed;
/* Process the replacement string. If the entire replacement is literal, just
copy it with length check. */
ptr = replacement;
if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
{
CHECKMEMCPY(ptr, rlength);
}
/* Within a non-literal replacement, which must be scanned character by
character, local literal mode can be set by \Q, but only in extended mode
when backslashes are being interpreted. In extended mode we must handle
nested substrings that are to be reprocessed. */
else for (;;)
{
uint32_t ch;
unsigned int chlen;
int group;