
Commit b41b4ca

examples : add "simple" (#1840)
* Create `simple.cpp`
* minimalist example `CMakeLists.txt`
* Update Makefile for minimalist example
* remove 273: Trailing whitespace
* removed trailing white spaces simple.cpp
* typo and comments simple.cpp

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 13fe9d2 commit b41b4ca

3 files changed: +191 −1 lines

Makefile

Lines changed: 7 additions & 1 deletion
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple
 
 ifdef LLAMA_BUILD_SERVER
 BUILD_TARGETS += server
@@ -276,6 +276,12 @@ main: examples/main/main.cpp build-info.h ggml.
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	@echo
+	@echo '==== Run ./simple -h for help. ===='
+	@echo
+
 quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

examples/simple/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
set(TARGET simple)
add_executable(${TARGET} simple.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
endif()

examples/simple/simple.cpp

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif



int main(int argc, char ** argv)
{
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc == 1 || argv[1][0] == '-' )
    {
        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    if ( argc >= 2 )
    {
        params.model = argv[1];
    }

    if ( argc >= 3 )
    {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
        params.prompt = "Hello my name is";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------

    llama_init_backend();

    llama_context * ctx ;

    ctx = llama_init_from_gpt_params( params );

    if ( ctx == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
        return 1;
    }

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize( ctx , params.prompt , true );

    const int max_context_size     = llama_n_ctx( ctx );
    const int max_tokens_list_size = max_context_size - 4 ;

    if ( (int)tokens_list.size() > max_tokens_list_size )
    {
        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
             __func__ , (int)tokens_list.size() , max_tokens_list_size );
        return 1;
    }

    fprintf( stderr, "\n\n" );

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    }

    fflush(stdout);


    //---------------------------------
    // Main prediction loop :
    //---------------------------------

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------

        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr, "%s : failed to eval\n" , __func__ );
            return 1;
        }

        tokens_list.clear();

        //---------------------------------
        // Select the best prediction :
        //---------------------------------

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)

        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );

        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );


        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
            fprintf(stderr, " [end of text]\n");
            break;
        }

        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );

        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );

    } // wend of main loop

    llama_free( ctx );

    return 0;
}

// EOF
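
The selection step in this example is intentionally greedy: the token with the highest logit always wins, which keeps the output deterministic and the code minimal. As a rough, hypothetical sketch (not part of this commit), the same candidates array could instead be passed through the stochastic samplers that llama.h exposed around this time; the helper below and its k / p / temperature values are illustrative assumptions, not values taken from the repository.

#include "llama.h"

// Hypothetical alternative to the llama_sample_token_greedy() call above.
// Assumes the llama.h sampling helpers of this era: llama_sample_top_k,
// llama_sample_top_p, llama_sample_temperature and llama_sample_token.
static llama_token sample_stochastic( llama_context * ctx , llama_token_data_array * candidates_p )
{
    llama_sample_top_k( ctx , candidates_p , 40 , 1 );       // keep only the 40 most likely tokens
    llama_sample_top_p( ctx , candidates_p , 0.95f , 1 );    // then the smallest set covering 95% of the probability mass
    llama_sample_temperature( ctx , candidates_p , 0.80f );  // rescale the remaining logits
    return llama_sample_token( ctx , candidates_p );         // draw one token at random from what is left
}

In simple.cpp this would replace only the llama_sample_token_greedy( ctx , &candidates_p ) line; the rest of the prediction loop stays the same.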
