.\" pcreapi.3 - manual page for the PCRE native API
.TH PCREAPI 3 "18 December 2015" "PCRE 8.39"
.SH NAME
PCRE - Perl-compatible regular expressions
.sp
.B #include <pcre.h>
.
.
.SH "PCRE NATIVE API BASIC FUNCTIONS"
.rs
.sp
.nf
.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
.B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
.B " const unsigned char *\fItableptr\fP);"
.sp
.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
.B " int *\fIerrorcodeptr\fP,"
.B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
.B " const unsigned char *\fItableptr\fP);"
.sp
.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
.B " const char **\fIerrptr\fP);"
.sp
.B void pcre_free_study(pcre_extra *\fIextra\fP);
.sp
.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
.B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);"
.sp
.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
.B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,"
.B " int *\fIworkspace\fP, int \fIwscount\fP);"
.fi
.
.
.SH "PCRE NATIVE API STRING EXTRACTION FUNCTIONS"
.rs
.sp
.nf
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
.B " const char *\fIsubject\fP, int *\fIovector\fP,"
.B " int \fIstringcount\fP, const char *\fIstringname\fP,"
.B " char *\fIbuffer\fP, int \fIbuffersize\fP);"
.sp
.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.B " int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,"
.B " int \fIbuffersize\fP);"
.sp
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
.B " const char *\fIsubject\fP, int *\fIovector\fP,"
.B " int \fIstringcount\fP, const char *\fIstringname\fP,"
.B " const char **\fIstringptr\fP);"
.sp
.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
.B " const char *\fIname\fP);"
.sp
.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
.B " const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);"
.sp
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.B " int \fIstringcount\fP, int \fIstringnumber\fP,"
.B " const char **\fIstringptr\fP);"
.sp
.B int pcre_get_substring_list(const char *\fIsubject\fP,
.B " int *\fIovector\fP, int \fIstringcount\fP, const char ***\fIlistptr\fP);"
.sp
.B void pcre_free_substring(const char *\fIstringptr\fP);
.sp
.B void pcre_free_substring_list(const char **\fIstringptr\fP);
.fi
.
.
.SH "PCRE NATIVE API AUXILIARY FUNCTIONS"
.rs
.sp
.nf
.B int pcre_jit_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
.B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,"
.B " pcre_jit_stack *\fIjstack\fP);"
.sp
.B pcre_jit_stack *pcre_jit_stack_alloc(int \fIstartsize\fP, int \fImaxsize\fP);
.sp
.B void pcre_jit_stack_free(pcre_jit_stack *\fIstack\fP);
.sp
.B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
.B " pcre_jit_callback \fIcallback\fP, void *\fIdata\fP);"
.sp
.B const unsigned char *pcre_maketables(void);
.sp
.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.B " int \fIwhat\fP, void *\fIwhere\fP);"
.sp
.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
.sp
.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
.sp
.B const char *pcre_version(void);
.sp
.B int pcre_pattern_to_host_byte_order(pcre *\fIcode\fP,
.B " pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP);"
.fi
.
.
.SH "PCRE NATIVE API INDIRECTED FUNCTIONS"
.rs
.sp
.nf
.B void *(*pcre_malloc)(size_t);
.sp
.B void (*pcre_free)(void *);
.sp
.B void *(*pcre_stack_malloc)(size_t);
.sp
.B void (*pcre_stack_free)(void *);
.sp
.B int (*pcre_callout)(pcre_callout_block *);
.sp
.B int (*pcre_stack_guard)(void);
.fi
.
.
.SH "PCRE 8-BIT, 16-BIT, AND 32-BIT LIBRARIES"
.rs
.sp
As well as support for 8-bit character strings, PCRE also supports 16-bit
strings (from release 8.30) and 32-bit strings (from release 8.32), by means of
two additional libraries. They can be built as well as, or instead of, the
8-bit library. To avoid too much complication, this document describes the
8-bit versions of the functions, with only occasional references to the 16-bit
and 32-bit libraries.
.P
The 16-bit and 32-bit functions operate in the same way as their 8-bit
counterparts; they just use different data types for their arguments and
results, and their names start with \fBpcre16_\fP or \fBpcre32_\fP instead of
\fBpcre_\fP. For every option that has UTF8 in its name (for example,
PCRE_UTF8), there are corresponding 16-bit and 32-bit names with UTF8 replaced
by UTF16 or UTF32, respectively. This facility is in fact just cosmetic; the
16-bit and 32-bit option names define the same bit values.
.P
References to bytes and UTF-8 in this document should be read as references to
16-bit data units and UTF-16 when using the 16-bit library, or 32-bit data
units and UTF-32 when using the 32-bit library, unless specified otherwise.
More details of the specific differences for the 16-bit and 32-bit libraries
are given in the
.\" HREF
\fBpcre16\fP
.\"
and
.\" HREF
\fBpcre32\fP
.\"
pages.
.
.
.SH "PCRE API OVERVIEW"
.rs
.sp
PCRE has its own native API, which is described in this document. There are
also some wrapper functions (for the 8-bit library only) that correspond to the
POSIX regular expression API, but they do not give access to all the
functionality. They are described in the
.\" HREF
\fBpcreposix\fP
.\"
documentation. Both of these APIs define a set of C function calls. A C++
wrapper (again for the 8-bit library only) is also distributed with PCRE. It is
documented in the
.\" HREF
\fBpcrecpp\fP
.\"
page.
.P
The native API C function prototypes are defined in the header file
\fBpcre.h\fP, and on Unix-like systems the (8-bit) library itself is called
\fBlibpcre\fP. It can normally be accessed by adding \fB-lpcre\fP to the
command for linking an application that uses PCRE. The header file defines the
macros PCRE_MAJOR and PCRE_MINOR to contain the major and minor release numbers
for the library. Applications can use these to include support for different
releases of PCRE.
.P
In a Windows environment, if you want to statically link an application program
against a non-dll \fBpcre.a\fP file, you must define PCRE_STATIC before
including \fBpcre.h\fP or \fBpcrecpp.h\fP, because otherwise the
\fBpcre_malloc()\fP and \fBpcre_free()\fP exported functions will be declared
\fB__declspec(dllimport)\fP, with unwanted results.
.P
The functions \fBpcre_compile()\fP, \fBpcre_compile2()\fP, \fBpcre_study()\fP,
and \fBpcre_exec()\fP are used for compiling and matching regular expressions
in a Perl-compatible manner. A sample program that demonstrates the simplest
way of using them is provided in the file called \fIpcredemo.c\fP in the PCRE
source distribution. A listing of this program is given in the
.\" HREF
\fBpcredemo\fP
.\"
documentation, and the
.\" HREF
\fBpcresample\fP
.\"
documentation describes how to compile and run it.
.P
Just-in-time compiler support is an optional feature of PCRE that can be built
in appropriate hardware environments. It greatly speeds up the matching
performance of many patterns. Simple programs can easily request that it be
used if available, by setting an option that is ignored when it is not
relevant. More complicated programs might need to make use of the functions
\fBpcre_jit_stack_alloc()\fP, \fBpcre_jit_stack_free()\fP, and
\fBpcre_assign_jit_stack()\fP in order to control the JIT code's memory usage.
.P
From release 8.32 there is also a direct interface for JIT execution, which
gives improved performance. The JIT-specific functions are discussed in the
.\" HREF
\fBpcrejit\fP
.\"
documentation.
.P
A second matching function, \fBpcre_dfa_exec()\fP, which is not
Perl-compatible, is also provided. This uses a different algorithm for the
matching. The alternative algorithm finds all possible matches (at a given
point in the subject), and scans the subject just once (unless there are
lookbehind assertions). However, this algorithm does not return captured
substrings. A description of the two matching algorithms and their advantages
and disadvantages is given in the
.\" HREF
\fBpcrematching\fP
.\"
documentation.
.P
In addition to the main compiling and matching functions, there are convenience
functions for extracting captured substrings from a subject string that is
matched by \fBpcre_exec()\fP. They are:
.sp
\fBpcre_copy_substring()\fP
\fBpcre_copy_named_substring()\fP
\fBpcre_get_substring()\fP
\fBpcre_get_named_substring()\fP
\fBpcre_get_substring_list()\fP
\fBpcre_get_stringnumber()\fP
\fBpcre_get_stringtable_entries()\fP
.sp
\fBpcre_free_substring()\fP and \fBpcre_free_substring_list()\fP are also
provided, to free the memory used for extracted strings.
.P
The function \fBpcre_maketables()\fP is used to build a set of character tables
in the current locale for passing to \fBpcre_compile()\fP, \fBpcre_exec()\fP,
or \fBpcre_dfa_exec()\fP. This is an optional facility that is provided for
specialist use. Most commonly, no special tables are passed, in which case
internal tables that are generated when PCRE is built are used.
.P
The function \fBpcre_fullinfo()\fP is used to find out information about a
compiled pattern. The function \fBpcre_version()\fP returns a pointer to a
string containing the version of PCRE and its date of release.
.P
The function \fBpcre_refcount()\fP maintains a reference count in a data block
containing a compiled pattern. This is provided for the benefit of
object-oriented applications.
.P
The global variables \fBpcre_malloc\fP and \fBpcre_free\fP initially contain
the entry points of the standard \fBmalloc()\fP and \fBfree()\fP functions,
respectively. PCRE calls the memory management functions via these variables,
so a calling program can replace them if it wishes to intercept the calls. This
should be done before calling any PCRE functions.
.P
The global variables \fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP are also
indirections to memory management functions. These special functions are used
only when PCRE is compiled to use the heap for remembering data, instead of
recursive function calls, when running the \fBpcre_exec()\fP function. See the
.\" HREF
\fBpcrebuild\fP
.\"
documentation for details of how to do this. It is a non-standard way of
building PCRE, for use in environments that have limited stacks. Because of the
greater use of memory management, it runs more slowly. Separate functions are
provided so that special-purpose external code can be used for this case. When
used, these functions always allocate memory blocks of the same size. There is
a discussion about PCRE's stack usage in the
.\" HREF
\fBpcrestack\fP
.\"
documentation.
.P
The global variable \fBpcre_callout\fP initially contains NULL. It can be set
by the caller to a "callout" function, which PCRE will then call at specified
points during a matching operation. Details are given in the
.\" HREF
\fBpcrecallout\fP
.\"
documentation.
.P
The global variable \fBpcre_stack_guard\fP initially contains NULL. It can be
set by the caller to a function that is called by PCRE whenever it starts
to compile a parenthesized part of a pattern. When parentheses are nested, PCRE
uses recursive function calls, which use up the system stack. This function is
provided so that applications with restricted stacks can force a compilation
error if the stack runs out. The function should return zero if all is well, or
non-zero to force an error.
.
.
.\" HTML <a name="newlines"></a>
.SH NEWLINES
.rs
.sp
PCRE supports five different conventions for indicating line breaks in
strings: a single CR (carriage return) character, a single LF (linefeed)
character, the two-character sequence CRLF, any of the three preceding, or any
Unicode newline sequence. The Unicode newline sequences are the three just
mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed,
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
(paragraph separator, U+2029).
.P
Each of the first three conventions is used by at least one operating system as
its standard newline sequence. When PCRE is built, a default can be specified.
If none is specified at build time, the default is LF, which is the Unix
standard. When PCRE is run, the default can be overridden, either when a
pattern is compiled, or when it is matched.
.P
At compile time, the newline convention can be specified by the \fIoptions\fP
argument of \fBpcre_compile()\fP, or it can be specified by special text at the
start of the pattern itself; this overrides any other settings. See the
.\" HREF
\fBpcrepattern\fP
.\"
page for details of the special character sequences.
.P
In the PCRE documentation the word "newline" is used to mean "the character or
pair of characters that indicate a line break". The choice of newline
convention affects the handling of the dot, circumflex, and dollar
metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
recognized line ending sequence, the match position advancement for a
non-anchored pattern. There is more detail about this in the
.\" HTML <a href="#execoptions">
.\" </a>
section on \fBpcre_exec()\fP options
.\"
below.
.P
The choice of newline convention does not affect the interpretation of
the \en or \er escape sequences, nor does it affect what \eR matches, which is
controlled in a similar way, but by separate options.
.
.
.SH MULTITHREADING
.rs
.sp
The PCRE functions can be used in multithreaded applications, with the
proviso that the memory management functions pointed to by \fBpcre_malloc\fP,
\fBpcre_free\fP, \fBpcre_stack_malloc\fP, and \fBpcre_stack_free\fP, and the
callout and stack-checking functions pointed to by \fBpcre_callout\fP and
\fBpcre_stack_guard\fP, are shared by all threads.
.P
The compiled form of a regular expression is not altered during matching, so
the same compiled pattern can safely be used by several threads at once.
.P
If the just-in-time optimization feature is being used, it needs separate
memory stack areas for each thread. See the
.\" HREF
\fBpcrejit\fP
.\"
documentation for more details.
.
.
.SH "SAVING PRECOMPILED PATTERNS FOR LATER USE"
.rs
.sp
The compiled form of a regular expression can be saved and re-used at a later
time, possibly by a different program, and even on a host other than the one on
which it was compiled. Details are given in the
.\" HREF
\fBpcreprecompile\fP
.\"
documentation, which includes a description of the
\fBpcre_pattern_to_host_byte_order()\fP function. However, compiling a regular
expression with one version of PCRE for use with a different version is not
guaranteed to work and may cause crashes.
.
.
.SH "CHECKING BUILD-TIME OPTIONS"
.rs
.sp
.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
.PP
The function \fBpcre_config()\fP makes it possible for a PCRE client to
discover which optional features have been compiled into the PCRE library. The
.\" HREF
\fBpcrebuild\fP
.\"
documentation has more details about these optional features.
.P
The first argument for \fBpcre_config()\fP is an integer, specifying which
information is required; the second argument is a pointer to a variable into
which the information is placed. The returned value is zero on success, or the
negative error code PCRE_ERROR_BADOPTION if the value in the first argument is
not recognized. The following information is available:
.sp
PCRE_CONFIG_UTF8
.sp
The output is an integer that is set to one if UTF-8 support is available;
otherwise it is set to zero. This value should normally be given to the 8-bit
version of this function, \fBpcre_config()\fP. If it is given to the 16-bit
or 32-bit version of this function, the result is PCRE_ERROR_BADOPTION.
.sp
PCRE_CONFIG_UTF16
.sp
The output is an integer that is set to one if UTF-16 support is available;
otherwise it is set to zero. This value should normally be given to the 16-bit
version of this function, \fBpcre16_config()\fP. If it is given to the 8-bit
or 32-bit version of this function, the result is PCRE_ERROR_BADOPTION.
.sp
PCRE_CONFIG_UTF32
.sp
The output is an integer that is set to one if UTF-32 support is available;
otherwise it is set to zero. This value should normally be given to the 32-bit
version of this function, \fBpcre32_config()\fP. If it is given to the 8-bit
or 16-bit version of this function, the result is PCRE_ERROR_BADOPTION.
.sp
PCRE_CONFIG_UNICODE_PROPERTIES
.sp
The output is an integer that is set to one if support for Unicode character
properties is available; otherwise it is set to zero.
.sp
PCRE_CONFIG_JIT
.sp
The output is an integer that is set to one if support for just-in-time
compiling is available; otherwise it is set to zero.
.sp
PCRE_CONFIG_JITTARGET
.sp
The output is a pointer to a zero-terminated "const char *" string. If JIT
support is available, the string contains the name of the architecture for
which the JIT compiler is configured, for example "x86 32bit (little endian +
unaligned)". If JIT support is not available, the result is NULL.
.sp
PCRE_CONFIG_NEWLINE
.sp
The output is an integer whose value specifies the default character sequence
that is recognized as meaning "newline". The values that are supported in
ASCII/Unicode environments are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for
ANYCRLF, and -1 for ANY. In EBCDIC environments, CR, ANYCRLF, and ANY yield the
same values. However, the value for LF is normally 21, though some EBCDIC
environments use 37. The corresponding values for CRLF are 3349 and 3365. The
default should normally correspond to the standard sequence for your operating
system.
.sp
PCRE_CONFIG_BSR
.sp
The output is an integer whose value indicates what character sequences the \eR
escape sequence matches by default. A value of 0 means that \eR matches any
Unicode line ending sequence; a value of 1 means that \eR matches only CR, LF,
or CRLF. The default can be overridden when a pattern is compiled or matched.
.sp
PCRE_CONFIG_LINK_SIZE
.sp
The output is an integer that contains the number of bytes used for internal
linkage in compiled regular expressions. For the 8-bit library, the value can
be 2, 3, or 4. For the 16-bit and 32-bit libraries, the value is either 2 or 4
and is still a number of bytes. The default value of 2 is sufficient for all but the
most massive patterns, since it allows the compiled pattern to be up to 64K in
size. Larger values allow larger regular expressions to be compiled, at the
expense of slower matching.
.sp
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
.sp
The output is an integer that contains the threshold above which the POSIX
interface uses \fBmalloc()\fP for output vectors. Further details are given in
the
.\" HREF
\fBpcreposix\fP
.\"
documentation.
.sp
PCRE_CONFIG_PARENS_LIMIT
.sp
The output is a long integer that gives the maximum depth of nesting of
parentheses (of any kind) in a pattern. This limit is imposed to cap the amount
of system stack used when a pattern is compiled. It is specified when PCRE is
built; the default is 250. This limit does not take into account the stack that
may already be used by the calling application. For finer control over
compilation stack usage, you can set a pointer to an external checking function
in \fBpcre_stack_guard\fP.
.sp
PCRE_CONFIG_MATCH_LIMIT
.sp
The output is a long integer that gives the default limit for the number of
internal matching function calls in a \fBpcre_exec()\fP execution. Further
details are given with \fBpcre_exec()\fP below.
.sp
PCRE_CONFIG_MATCH_LIMIT_RECURSION
.sp
The output is a long integer that gives the default limit for the depth of
recursion when calling the internal matching function in a \fBpcre_exec()\fP
execution. Further details are given with \fBpcre_exec()\fP below.
.sp
PCRE_CONFIG_STACKRECURSE
.sp
The output is an integer that is set to one if internal recursion when running
\fBpcre_exec()\fP is implemented by recursive function calls that use the stack
to remember their state. This is the usual way that PCRE is compiled. The
output is zero if PCRE was compiled to use blocks of data on the heap instead
of recursive function calls. In this case, \fBpcre_stack_malloc\fP and
\fBpcre_stack_free\fP are called to manage memory blocks on the heap, thus
avoiding the use of the stack.
.
.
.SH "COMPILING A PATTERN"
.rs
.sp
.nf
.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
.B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
.B " const unsigned char *\fItableptr\fP);"
.sp
.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
.B " int *\fIerrorcodeptr\fP,"
.B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
.B " const unsigned char *\fItableptr\fP);"
.fi
.P
Either of the functions \fBpcre_compile()\fP or \fBpcre_compile2()\fP can be
called to compile a pattern into an internal form. The only difference between
the two interfaces is that \fBpcre_compile2()\fP has an additional argument,
\fIerrorcodeptr\fP, via which a numerical error code can be returned. To avoid
too much repetition, we refer just to \fBpcre_compile()\fP below, but the
information applies equally to \fBpcre_compile2()\fP.
.P
The pattern is a C string terminated by a binary zero, and is passed in the
\fIpattern\fP argument. A pointer to a single block of memory that is obtained
via \fBpcre_malloc\fP is returned. This contains the compiled code and related
data. The \fBpcre\fP type is defined for the returned block; this is a typedef
for a structure whose contents are not externally defined. It is up to the
caller to free the memory (via \fBpcre_free\fP) when it is no longer required.
.P
Although the compiled code of a PCRE regex is relocatable, that is, it does not
depend on memory location, the complete \fBpcre\fP data block is not
fully relocatable, because it may contain a copy of the \fItableptr\fP
argument, which is an address (see below).
.P
The \fIoptions\fP argument contains various bit settings that affect the
compilation. It should be zero if no options are required. The available
options are described below. Some of them (in particular, those that are
compatible with Perl, but some others as well) can also be set and unset from
within the pattern (see the detailed description in the
.\" HREF
\fBpcrepattern\fP
.\"
documentation). For those options that can be different in different parts of
the pattern, the contents of the \fIoptions\fP argument specify their
settings at the start of compilation and execution. The PCRE_ANCHORED,
PCRE_BSR_\fIxxx\fP, PCRE_NEWLINE_\fIxxx\fP, PCRE_NO_UTF8_CHECK, and
PCRE_NO_START_OPTIMIZE options can be set at the time of matching as well as at
compile time.
.P
If \fIerrptr\fP is NULL, \fBpcre_compile()\fP returns NULL immediately.
Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
NULL, and sets the variable pointed to by \fIerrptr\fP to point to a textual
error message. This is a static string that is part of the library. You must
not try to free it. Normally, the offset from the start of the pattern to the
data unit that was being processed when the error was discovered is placed in
the variable pointed to by \fIerroffset\fP, which must not be NULL (if it is,
an immediate error is given). However, for an invalid UTF-8 or UTF-16 string,
the offset is that of the first data unit of the failing character.
.P
Some errors are not detected until the whole pattern has been scanned; in these
cases, the offset passed back is the length of the pattern. Note that the
offset is in data units, not characters, even in a UTF mode. It may sometimes
point into the middle of a UTF-8 or UTF-16 character.
.P
If \fBpcre_compile2()\fP is used instead of \fBpcre_compile()\fP, and the
\fIerrorcodeptr\fP argument is not NULL, a non-zero error code number is
returned via this argument in the event of an error. This is in addition to the
textual error message. Error codes and messages are listed below.
.P
If the final argument, \fItableptr\fP, is NULL, PCRE uses a default set of
character tables that are built when PCRE is compiled, using the default C
locale. Otherwise, \fItableptr\fP must be an address that is the result of a
call to \fBpcre_maketables()\fP. This value is stored with the compiled
pattern, and used again by \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP when the
pattern is matched. For more discussion, see the section on locale support
below.
.P
This code fragment shows a typical straightforward call to \fBpcre_compile()\fP:
.sp
pcre *re;
const char *error;
int erroffset;
re = pcre_compile(
"^A.*Z", /* the pattern */
0, /* default options */
&error, /* for error message */
&erroffset, /* for error offset */
NULL); /* use default character tables */
.sp
The following names for option bits are defined in the \fBpcre.h\fP header
file:
.sp
PCRE_ANCHORED
.sp
If this bit is set, the pattern is forced to be "anchored", that is, it is
constrained to match only at the first matching point in the string that is
being searched (the "subject string"). This effect can also be achieved by
appropriate constructs in the pattern itself, which is the only way to do it in
Perl.
.sp
PCRE_AUTO_CALLOUT
.sp
If this bit is set, \fBpcre_compile()\fP automatically inserts callout items,
all with number 255, before each pattern item. For discussion of the callout
facility, see the
.\" HREF
\fBpcrecallout\fP
.\"
documentation.
.sp
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
.sp
These options (which are mutually exclusive) control what the \eR escape
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
match any Unicode newline sequence. The default is specified when PCRE is
built. It can be overridden from within the pattern, or by setting an option
when a compiled pattern is matched.
.sp
PCRE_CASELESS
.sp
If this bit is set, letters in the pattern match both upper and lower case
letters. It is equivalent to Perl's /i option, and it can be changed within a
pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
concept of case for characters whose values are less than 128, so caseless
matching is always possible. For characters with higher values, the concept of
case is supported if PCRE is compiled with Unicode property support, but not
otherwise. If you want to use caseless matching for characters 128 and above,
you must ensure that PCRE is compiled with Unicode property support as well as
with UTF-8 support.
.sp
PCRE_DOLLAR_ENDONLY
.sp
If this bit is set, a dollar metacharacter in the pattern matches only at the
end of the subject string. Without this option, a dollar also matches
immediately before a newline at the end of the string (but not before any other
newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
There is no equivalent to this option in Perl, and no way to set it within a
pattern.
.sp
PCRE_DOTALL
.sp
If this bit is set, a dot metacharacter in the pattern matches a character of
any value, including one that indicates a newline. However, it only ever
matches one character, even if newlines are coded as CRLF. Without this option,
a dot does not match when the current position is at a newline. This option is
equivalent to Perl's /s option, and it can be changed within a pattern by a
(?s) option setting. A negative class such as [^a] always matches newline
characters, independent of the setting of this option.
.sp
PCRE_DUPNAMES
.sp
If this bit is set, names used to identify capturing subpatterns need not be
unique. This can be helpful for certain types of pattern when it is known that
only one instance of the named subpattern can ever be matched. There are more
details of named subpatterns below; see also the
.\" HREF
\fBpcrepattern\fP
.\"
documentation.
.sp
PCRE_EXTENDED
.sp
If this bit is set, most white space characters in the pattern are totally
ignored except when escaped or inside a character class. However, white space
is not allowed within sequences such as (?> that introduce various
parenthesized subpatterns, nor within a numerical quantifier such as {1,3}.
Ignorable white space is nevertheless permitted between an item and a following
quantifier and between a quantifier and a following + that indicates
possessiveness.
.P
White space formerly did not include the VT character (code 11), because Perl
did not treat this character as white space. However, Perl changed at release
5.18, so PCRE followed at release 8.34, and VT is now treated as white space.
.P
PCRE_EXTENDED also causes characters between an unescaped # outside a character
class and the next newline, inclusive, to be ignored. PCRE_EXTENDED is
equivalent to Perl's /x option, and it can be changed within a pattern by a
(?x) option setting.
.P
Which characters are interpreted as newlines is controlled by the options
passed to \fBpcre_compile()\fP or by a special sequence at the start of the
pattern, as described in the section entitled
.\" HTML <a href="pcrepattern.html#newlines">
.\" </a>
"Newline conventions"
.\"
in the \fBpcrepattern\fP documentation. Note that the end of this type of
comment is a literal newline sequence in the pattern; escape sequences that
happen to represent a newline do not count.
.P
This option makes it possible to include comments inside complicated patterns.
Note, however, that this applies only to data characters. White space characters
may never appear within special character sequences in a pattern, for example
within the sequence (?( that introduces a conditional subpattern.
.sp
PCRE_EXTRA
.sp
This option was invented in order to turn on additional functionality of PCRE
that is incompatible with Perl, but it is currently of very little use. When
set, any backslash in a pattern that is followed by a letter that has no
special meaning causes an error, thus reserving these combinations for future
expansion. By default, as in Perl, a backslash followed by a letter with no
special meaning is treated as a literal. (Perl can, however, be persuaded to
give an error for this, by running it with the -w option.) There are at present
no other features controlled by this option. It can also be set by a (?X)
option setting within a pattern.
.sp
PCRE_FIRSTLINE
.sp
If this option is set, an unanchored pattern is required to match before or at
the first newline in the subject string, though the matched text may continue
over the newline.
.sp
PCRE_JAVASCRIPT_COMPAT
.sp
If this option is set, PCRE's behaviour is changed in some ways so that it is
compatible with JavaScript rather than Perl. The changes are as follows:
.P
(1) A lone closing square bracket in a pattern causes a compile-time error,
because this is illegal in JavaScript (by default it is treated as a data
character). Thus, the pattern AB]CD becomes illegal when this option is set.
.P
(2) At run time, a back reference to an unset subpattern group matches an empty
string (by default this causes the current matching alternative to fail). A
pattern such as (\e1)(a) succeeds when this option is set (assuming it can find
an "a" in the subject), whereas it fails by default, for Perl compatibility.
.P
(3) \eU matches an upper case "U" character; by default \eU causes a compile
time error (Perl uses \eU to upper case subsequent characters).
.P
(4) \eu matches a lower case "u" character unless it is followed by four
hexadecimal digits, in which case the hexadecimal number defines the code point
to match. By default, \eu causes a compile time error (Perl uses it to upper
case the following character).
.P
(5) \ex matches a lower case "x" character unless it is followed by two
hexadecimal digits, in which case the hexadecimal number defines the code point
to match. By default, as in Perl, a hexadecimal number is always expected after
\ex, but it may have zero, one, or two digits (so, for example, \exz matches a
binary zero character followed by z).
.sp
PCRE_MULTILINE
.sp
By default, for the purposes of matching "start of line" and "end of line",
PCRE treats the subject string as consisting of a single line of characters,
even if it actually contains newlines. The "start of line" metacharacter (^)
matches only at the start of the string, and the "end of line" metacharacter
($) matches only at the end of the string, or before a terminating newline
(except when PCRE_DOLLAR_ENDONLY is set). Note, however, that unless
PCRE_DOTALL is set, the "any character" metacharacter (.) does not match at a
newline. This behaviour (for ^, $, and dot) is the same as Perl.
.P
When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
match immediately following or immediately before internal newlines in the
subject string, respectively, as well as at the very start and end. This is
equivalent to Perl's /m option, and it can be changed within a pattern by a
(?m) option setting. If there are no newlines in a subject string, or no
occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
.sp
PCRE_NEVER_UTF
.sp
This option locks out interpretation of the pattern as UTF-8 (or UTF-16 or
UTF-32 in the 16-bit and 32-bit libraries). In particular, it prevents the
creator of the pattern from switching to UTF interpretation by starting the
pattern with (*UTF). This may be useful in applications that process patterns
from external sources. The combination of PCRE_UTF8 and PCRE_NEVER_UTF also
causes an error.
.sp
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
.sp
These options override the default newline definition that was chosen when PCRE
was built. Setting the first or the second specifies that a newline is
indicated by a single character (CR or LF, respectively). Setting
PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
that any Unicode newline sequence should be recognized.
.P
In an ASCII/Unicode environment, the Unicode newline sequences are the three
just mentioned, plus the single characters VT (vertical tab, U+000B), FF (form
feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
(paragraph separator, U+2029). For the 8-bit library, the last two are
recognized only in UTF-8 mode.
.P
When PCRE is compiled to run in an EBCDIC (mainframe) environment, the code for
CR is 0x0d, the same as ASCII. However, the character code for LF is normally
0x15, though in some EBCDIC environments 0x25 is used. Whichever of these is
not LF is made to correspond to Unicode's NEL character. EBCDIC codes are all
less than 256. For more details, see the
.\" HREF
\fBpcrebuild\fP
.\"
documentation.
.P
The newline setting in the options word uses three bits that are treated
as a number, giving eight possibilities. Currently only six are used (default
plus the five values above). This means that if you set more than one newline
option, the combination may or may not be sensible. For example,
PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
other combinations may yield unused numbers and cause an error.
.P
The only time that a line break in a pattern is specially recognized when
compiling is when PCRE_EXTENDED is set. CR and LF are white space characters,
and so are ignored in this mode. Also, an unescaped # outside a character class
indicates a comment that lasts until after the next line break sequence. In
other circumstances, line break sequences in patterns are treated as literal
data.
.P
The newline option that is set at compile time becomes the default that is used
for \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
.sp
PCRE_NO_AUTO_CAPTURE
.sp
If this option is set, it disables the use of numbered capturing parentheses in
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
were followed by ?: but named parentheses can still be used for capturing (and
they acquire numbers in the usual way). There is no equivalent of this option
in Perl.
.sp
PCRE_NO_AUTO_POSSESS
.sp
If this option is set, it disables "auto-possessification". This is an
optimization that, for example, turns a+b into a++b in order to avoid
backtracks into a+ that can never be successful. However, if callouts are in
use, auto-possessification means that some of them are never taken. You can set
this option if you want the matching functions to do a full unoptimized search
and run all the callouts, but it is mainly provided for testing purposes.
.sp
PCRE_NO_START_OPTIMIZE
.sp
This is an option that acts at matching time; that is, it is really an option
for \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. If it is set at compile time,
it is remembered with the compiled pattern and assumed at matching time. This
is necessary if you want to use JIT execution, because the JIT compiler needs
to know whether or not this option is set. For details see the discussion of
PCRE_NO_START_OPTIMIZE
.\" HTML <a href="#execoptions">
.\" </a>
below.
.\"
.sp
PCRE_UCP
.sp
This option changes the way PCRE processes \eB, \eb, \eD, \ed, \eS, \es, \eW,
\ew, and some of the POSIX character classes. By default, only ASCII characters
are recognized, but if PCRE_UCP is set, Unicode properties are used instead to
classify characters. More details are given in the section on
.\" HTML <a href="pcrepattern.html#genericchartypes">
.\" </a>
generic character types
.\"
in the
.\" HREF
\fBpcrepattern\fP
.\"
page. If you set PCRE_UCP, matching one of the items it affects takes much
longer. The option is available only if PCRE has been compiled with Unicode
property support.
.sp
PCRE_UNGREEDY
.sp
This option inverts the "greediness" of the quantifiers so that they are not
greedy by default, but become greedy if followed by "?". It is not compatible
with Perl. It can also be set by a (?U) option setting within the pattern.
.sp
PCRE_UTF8
.sp
This option causes PCRE to regard both the pattern and the subject as strings
of UTF-8 characters instead of single-byte strings. However, it is available
only when PCRE is built to include UTF support. If not, the use of this option
provokes an error. Details of how this option changes the behaviour of PCRE are
given in the
.\" HREF
\fBpcreunicode\fP
.\"
page.
.sp
PCRE_NO_UTF8_CHECK
.sp
When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
automatically checked. There is a discussion about the
.\" HTML <a href="pcreunicode.html#utf8strings">
.\" </a>
validity of UTF-8 strings
.\"
in the
.\" HREF
\fBpcreunicode\fP
.\"
page. If an invalid UTF-8 sequence is found, \fBpcre_compile()\fP returns an
error. If you already know that your pattern is valid, and you want to skip
this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK option.
When it is set, the effect of passing an invalid UTF-8 string as a pattern is
undefined. It may cause your program to crash or loop. Note that this option
can also be passed to \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, to suppress
the validity checking of subject strings only. If the same string is being
matched many times, the option can be safely set for the second and subsequent
matchings to improve performance.
.
.
.SH "COMPILATION ERROR CODES"
.rs
.sp
The following table lists the error codes that may be returned by
\fBpcre_compile2()\fP, along with the error messages that may be returned by
both compiling functions. Note that error messages are always 8-bit ASCII
strings, even in 16-bit or 32-bit mode. As PCRE has developed, some error codes
have fallen out of use. To avoid confusion, they have not been re-used.
.sp
0 no error
1 \e at end of pattern
2 \ec at end of pattern
3 unrecognized character follows \e
4 numbers out of order in {} quantifier
5 number too big in {} quantifier
6 missing terminating ] for character class
7 invalid escape sequence in character class
8 range out of order in character class
9 nothing to repeat
10 [this code is not in use]
11 internal error: unexpected repeat
12 unrecognized character after (? or (?-
13 POSIX named classes are supported only within a class
14 missing )
15 reference to non-existent subpattern
16 erroffset passed as NULL
17 unknown option bit(s) set
18 missing ) after comment
19 [this code is not in use]
20 regular expression is too large
21 failed to get memory
22 unmatched parentheses
23 internal error: code overflow
24 unrecognized character after (?<
25 lookbehind assertion is not fixed length
26 malformed number or name after (?(
27 conditional group contains more than two branches
28 assertion expected after (?(
29 (?R or (?[+-]digits must be followed by )
30 unknown POSIX class name
31 POSIX collating elements are not supported
32 this version of PCRE is compiled without UTF support
33 [this code is not in use]
34 character value in \ex{} or \eo{} is too large
35 invalid condition (?(0)
36 \eC not allowed in lookbehind assertion
37 PCRE does not support \eL, \el, \eN{name}, \eU, or \eu
38 number after (?C is > 255
39 closing ) for (?C expected
40 recursive call could loop indefinitely
41 unrecognized character after (?P
42 syntax error in subpattern name (missing terminator)
43 two named subpatterns have the same name
44 invalid UTF-8 string (specifically UTF-8)
45 support for \eP, \ep, and \eX has not been compiled
46 malformed \eP or \ep sequence
47 unknown property name after \eP or \ep
48 subpattern name is too long (maximum 32 characters)
49 too many named subpatterns (maximum 10000)
50 [this code is not in use]
51 octal value is greater than \e377 in 8-bit non-UTF-8 mode
52 internal error: overran compiling workspace
53 internal error: previously-checked referenced subpattern
not found
54 DEFINE group contains more than one branch
55 repeating a DEFINE group is not allowed
56 inconsistent NEWLINE options
57 \eg is not followed by a braced, angle-bracketed, or quoted
name/number or by a plain number
58 a numbered reference must not be zero
59 an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)
60 (*VERB) not recognized or malformed
61 number is too big
62 subpattern name expected
63 digit expected after (?+
64 ] is an invalid data character in JavaScript compatibility mode
65 different names for subpatterns of the same number are
not allowed
66 (*MARK) must have an argument
67 this version of PCRE is not compiled with Unicode property
support
68 \ec must be followed by an ASCII character
69 \ek is not followed by a braced, angle-bracketed, or quoted name
70 internal error: unknown opcode in find_fixedlength()
71 \eN is not supported in a class
72 too many forward references
73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
74 invalid UTF-16 string (specifically UTF-16)
75 name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)
76 character value in \eu.... sequence is too large
77 invalid UTF-32 string (specifically UTF-32)