@@ -19,8 +19,9 @@ project(Tokenizers)
19
19
option (TOKENIZERS_BUILD_TEST "Build tests" OFF )
20
20
option (TOKENIZERS_BUILD_TOOLS "Build tools" OFF )
21
21
option (SUPPORT_REGEX_LOOKAHEAD
22
- "Support regex lookahead patterns (requires PCRE2)" OFF )
22
+ "Support regex lookahead patterns (requires PCRE2)" OFF )
23
23
24
+ include (Utils.cmake )
24
25
# Ignore weak attribute warning
25
26
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes" )
26
27
@@ -34,20 +35,6 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
34
35
add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/re2 )
35
36
add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/sentencepiece )
36
37
37
- # Configure PCRE2
38
- if (SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST )
39
- set (PCRE2_BUILD_PCRE2_8 ON )
40
- set (PCRE2_BUILD_PCRE2_16 OFF )
41
- set (PCRE2_BUILD_PCRE2_32 OFF )
42
- set (PCRE2_BUILD_TESTS OFF )
43
- set (PCRE2_BUILD_PCRE2GREP OFF )
44
- set (PCRE2_BUILD_PCRE2TEST OFF )
45
- set (PCRE2_BUILD_PCRE2GPERF OFF )
46
- set (PCRE2_BUILD_DOCS OFF )
47
- set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
48
- add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2 )
49
- endif ()
50
-
51
38
set (CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag} )
52
39
53
40
file (GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR} /src/*.cpp )
@@ -60,14 +47,8 @@ set(tokenizers_source_files
60
47
${CMAKE_CURRENT_SOURCE_DIR} /src/regex.cpp
61
48
${CMAKE_CURRENT_SOURCE_DIR} /src/sentencepiece.cpp
62
49
${CMAKE_CURRENT_SOURCE_DIR} /src/tiktoken.cpp
63
- ${CMAKE_CURRENT_SOURCE_DIR} /src/token_decoder.cpp
64
- )
65
- if (SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST )
66
- list (APPEND
67
- tokenizers_source_files
68
- ${CMAKE_CURRENT_SOURCE_DIR} /src/pcre2_regex.cpp
69
- ${CMAKE_CURRENT_SOURCE_DIR} /src/std_regex.cpp )
70
- endif ()
50
+ ${CMAKE_CURRENT_SOURCE_DIR} /src/token_decoder.cpp )
51
+
71
52
file (GLOB unicode_source_files
72
53
${CMAKE_CURRENT_SOURCE_DIR} /third-party/llama.cpp-unicode/src/*.cpp )
73
54
add_library (tokenizers STATIC ${tokenizers_source_files}
@@ -85,11 +66,26 @@ target_include_directories(
85
66
target_link_libraries (tokenizers PUBLIC sentencepiece-static re2::re2 )
86
67
87
68
if (SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST )
88
- target_include_directories (tokenizers
89
- PUBLIC
90
- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src )
91
- target_link_libraries (tokenizers PUBLIC pcre2-8 )
92
- target_compile_definitions (tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD )
69
+ set (PCRE2_BUILD_PCRE2_8 ON )
70
+ set (PCRE2_BUILD_PCRE2_16 OFF )
71
+ set (PCRE2_BUILD_PCRE2_32 OFF )
72
+ set (PCRE2_BUILD_TESTS OFF )
73
+ set (PCRE2_BUILD_PCRE2GREP OFF )
74
+ set (PCRE2_BUILD_PCRE2TEST OFF )
75
+ set (PCRE2_BUILD_PCRE2GPERF OFF )
76
+ set (PCRE2_BUILD_DOCS OFF )
77
+ set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
78
+ add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2 )
79
+ add_library (
80
+ regex_lookahead STATIC
81
+ ${CMAKE_CURRENT_SOURCE_DIR} /src/pcre2_regex.cpp
82
+ ${CMAKE_CURRENT_SOURCE_DIR} /src/regex_lookahead.cpp
83
+ ${CMAKE_CURRENT_SOURCE_DIR} /src/std_regex.cpp )
84
+ target_link_libraries (regex_lookahead PUBLIC pcre2-8 )
85
+ target_include_directories (
86
+ regex_lookahead PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} /include
87
+ ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src )
88
+ target_link_options_shared_lib (regex_lookahead )
93
89
endif ()
94
90
95
91
# Build test
@@ -120,9 +116,9 @@ if(TOKENIZERS_BUILD_TEST)
120
116
${CMAKE_CURRENT_SOURCE_DIR} /include
121
117
${CMAKE_CURRENT_SOURCE_DIR} /third-party/sentencepiece
122
118
${CMAKE_CURRENT_SOURCE_DIR} /third-party/re2
123
- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/json/single_include
124
- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src )
125
- target_link_libraries ( ${test_name} gtest_main GTest::gmock tokenizers )
119
+ ${CMAKE_CURRENT_SOURCE_DIR} /third-party/json/single_include )
120
+ target_link_libraries ( ${test_name} gtest_main GTest::gmock tokenizers
121
+ regex_lookahead )
126
122
add_test (${test_name} "${test_name} " )
127
123
set_tests_properties (${test_name} PROPERTIES ENVIRONMENT ${test_env} )
128
124
endforeach ()
0 commit comments