99import java .util .LinkedList ;
1010import java .util .Objects ;
1111import java .util .function .Consumer ;
12- import java .util .stream .Collectors ;
1312
1413@ UtilityClass
1514public class Llama3Tokenizer {
@@ -48,39 +47,38 @@ public static String systemHeader() {
4847 return Llama3Tokenizer .header (ROLE_SYSTEM );
4948 }
5049
51- public static Llama3TextCompletion generateTextCompletion (@ NonNull TextConversation conversation ) {
52- final var textCompletionBuilder = new StringBuilder ();
53- textCompletionBuilder
54- .append (BEGIN_OF_TEXT )
55- .append (systemHeader ())
56- .append (conversation .getSystemMessage ());
50+ public static String tokenizeMessage (@ NonNull TextMessage message ) {
51+ final var messageBuilder = new StringBuilder ();
52+ addMessageToTextCompletion (messageBuilder ).accept (message );
53+ return messageBuilder .toString ();
54+ }
5755
56+ public static Llama3TextCompletion generateTextCompletion (@ NonNull TextConversation conversation ) {
57+ final var textCompletionBuilder = createBeginOfText (conversation .getSystemMessage ());
5858 conversation .getMessages ().forEach (addMessageToTextCompletion (textCompletionBuilder ));
5959 final var textCompletion = textCompletionBuilder .toString ();
6060 return new Llama3TextCompletion (textCompletion );
6161 }
6262
6363 public static Integer approximateConversationContextSize (@ NonNull TextConversation conversation , @ Nullable Integer tokenSize ) {
64- final var conversationPlainText = conversation .getMessages ().stream ()
65- .map (TextMessage ::getText )
66- .collect (Collectors .joining (System .lineSeparator ()));
67-
68- return getTokens (conversationPlainText , tokenSize );
64+ final var conversationTextCompletion = generateTextCompletion (conversation );
65+ return getTokens (conversationTextCompletion .getText (), tokenSize );
6966 }
7067
7168 public static TextConversation fitToContextWindow (@ NonNull TextConversation conversation , @ Nullable Integer contextWindowSize ) {
7269 contextWindowSize = Objects .requireNonNullElse (contextWindowSize , DEFAULT_CONTEXT_WINDOW_SIZE );
7370
74- final var systemMessage = conversation .getSystemMessage ();
75- final var systemMessageTokens = getTokens (systemMessage , null );
71+ final var systemMessage = createBeginOfText ( conversation .getSystemMessage () );
72+ final var systemMessageTokens = getTokens (systemMessage . toString () , null );
7673 int remainingTokens = contextWindowSize - systemMessageTokens ;
7774
7875 final var messages = conversation .getMessages ();
7976 final var fittedMessages = new LinkedList <TextMessage >();
8077
8178 for (int i = messages .size () - 1 ; i >= 0 ; i --) {
8279 final var message = messages .get (i );
83- final var messageTokens = getTokens (message .getText (), null );
80+ final var tokenizedMessage = tokenizeMessage (message );
81+ final var messageTokens = getTokens (tokenizedMessage , null );
8482
8583 if (remainingTokens - messageTokens >= 0 ) {
8684 fittedMessages .addFirst (message );
@@ -90,7 +88,7 @@ public static TextConversation fitToContextWindow(@NonNull TextConversation conv
9088 }
9189 }
9290
93- return new TextConversation (systemMessage , fittedMessages );
91+ return new TextConversation (conversation . getSystemMessage () , fittedMessages );
9492 }
9593
9694 Consumer <TextMessage > addMessageToTextCompletion (@ NonNull StringBuilder textCompletionBuilder ) {
@@ -100,8 +98,14 @@ Consumer<TextMessage> addMessageToTextCompletion(@NonNull StringBuilder textComp
10098 .append (END_OF_TEXT_ID );
10199 }
102100
103- private Integer getTokens (@ NonNull String string , @ Nullable Integer tokenSize ) {
101+ Integer getTokens (@ NonNull String string , @ Nullable Integer tokenSize ) {
104102 tokenSize = Objects .requireNonNullElse (tokenSize , APPROXIMATE_CHARACTERS_PER_TOKEN );
105103 return string .length () / tokenSize ;
106104 }
105+
106+ StringBuilder createBeginOfText (@ NonNull String systemMessage ) {
107+ return new StringBuilder ().append (BEGIN_OF_TEXT )
108+ .append (systemHeader ())
109+ .append (systemMessage );
110+ }
107111}
0 commit comments