Skip to content

Commit

Permalink
Allow the token run length to be changed.
Browse files Browse the repository at this point in the history
This lets library users make the tradeoff between memory use and throughput.

See #45.
  • Loading branch information
ianh committed Jan 8, 2025
1 parent a3470af commit dba6335
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 7 deletions.
6 changes: 6 additions & 0 deletions src/6a-generate.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#define BRACKET_SYMBOL_TOKEN %%bracket-symbol-token
#define COMMENT_TOKEN %%comment-token
#define OWL_DONT_INLINE OWL_DONT_INLINE
#define OWL_TOKEN_RUN_LENGTH OWL_TOKEN_RUN_LENGTH
#define TOKENIZE_BODY(...) static const char *tokenizer_source = EVALUATE_MACROS_AND_STRINGIFY(__VA_ARGS__);
#include "x-tokenize.h"
#define FINISHED_NODE_T size_t
Expand Down Expand Up @@ -412,6 +413,11 @@ void generate(struct generator *gen)
output_line(out, "#define OWL_DONT_INLINE");
output_line(out, "#endif");

output_line(out, "// This can be overridden to reduce the amount Owl allocates at once.");
output_line(out, "#ifndef OWL_TOKEN_RUN_LENGTH");
output_line(out, "#define OWL_TOKEN_RUN_LENGTH 4096");
output_line(out, "#endif");

output_line(out, "");
output_line(out, "struct %%prefix_tree {");
output_line(out, " const char *string;");
Expand Down
16 changes: 9 additions & 7 deletions src/x-tokenize.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@
#endif
#endif

#define TOKEN_RUN_LENGTH 4096
// Capacity, in tokens, of the arrays in struct owl_token_run. A definition
// made before this point takes precedence; otherwise the default is 4096.
// NOTE(review): owl_token_run keeps number_of_tokens and lengths_size in
// uint16_t fields and sizes lengths[] at twice this value, so overrides above
// 32767 look like they could overflow those counters - confirm before raising.
#ifndef OWL_TOKEN_RUN_LENGTH
#define OWL_TOKEN_RUN_LENGTH 4096
#endif

TOKENIZE_BODY
(
Expand All @@ -118,9 +120,9 @@ struct owl_token_run {
struct owl_token_run *prev;
uint16_t number_of_tokens;
uint16_t lengths_size;
uint8_t lengths[TOKEN_RUN_LENGTH * 2];
TOKEN_T tokens[TOKEN_RUN_LENGTH];
STATE_T states[TOKEN_RUN_LENGTH];
uint8_t lengths[OWL_TOKEN_RUN_LENGTH * 2];
TOKEN_T tokens[OWL_TOKEN_RUN_LENGTH];
STATE_T states[OWL_TOKEN_RUN_LENGTH];
};

struct owl_default_tokenizer {
Expand Down Expand Up @@ -246,7 +248,7 @@ owl_default_tokenizer_advance(struct owl_default_tokenizer *tokenizer,
const char *text = tokenizer->text;
size_t whitespace = tokenizer->whitespace;
size_t offset = tokenizer->offset;
while (number_of_tokens < TOKEN_RUN_LENGTH) {
while (number_of_tokens < OWL_TOKEN_RUN_LENGTH) {
char c = text[offset];
if (c == '\0')
break;
Expand Down Expand Up @@ -384,7 +386,7 @@ owl_default_tokenizer_advance(struct owl_default_tokenizer *tokenizer,
free(run);
return false;
}
if (end_token && number_of_tokens + 1 >= TOKEN_RUN_LENGTH)
if (end_token && number_of_tokens + 1 >= OWL_TOKEN_RUN_LENGTH)
break;
if (!encode_token_length(run, &lengths_size, token_length, whitespace))
break;
Expand Down Expand Up @@ -433,7 +435,7 @@ owl_default_tokenizer_advance(struct owl_default_tokenizer *tokenizer,
number_of_tokens++;
offset += token_length;
if (end_token) {
assert(number_of_tokens < TOKEN_RUN_LENGTH);
assert(number_of_tokens < OWL_TOKEN_RUN_LENGTH);
run->tokens[number_of_tokens] = BRACKET_SYMBOL_TOKEN;
number_of_tokens++;
}
Expand Down

0 comments on commit dba6335

Please sign in to comment.