Skip to content

Commit

Permalink
Vraspar/phi 3 ios update (#467)
Browse files Browse the repository at this point in the history
* Update Phi-3 iOS build instructions

* Refactor chat UI and token generation logic, and handle token generation errors

* Refactor GenAIGenerator to use instance method for token generation


Co-authored-by: Edward Chen <[email protected]>

---------

Co-authored-by: Edward Chen <[email protected]>
  • Loading branch information
vraspar and edgchen1 authored Oct 14, 2024
1 parent 5f46590 commit 976cdf5
Show file tree
Hide file tree
Showing 6 changed files with 273 additions and 40 deletions.
143 changes: 133 additions & 10 deletions mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,143 @@

import SwiftUI


/// One entry in the chat transcript shown by `ContentView`.
struct Message: Identifiable {
    /// Identity used by `ForEach`; generated once when the message is created.
    let id: UUID = UUID()
    /// Text shown in the bubble. Mutable so a response can be filled in after creation.
    var text: String
    /// `true` for messages typed by the user, `false` for model responses.
    let isUser: Bool
}

/// Chat screen for the on-device LLM demo.
///
/// The view keeps the transcript in local state, sends each prompt to a `GenAIGenerator`
/// on a background queue, and mirrors the tokens published by `SharedTokenUpdater` into the
/// most recent model message. Completion, statistics and errors arrive as notifications.
struct ContentView: View {
    @ObservedObject var tokenUpdater = SharedTokenUpdater.shared
    @State private var userInput: String = ""
    @State private var messages: [Message] = []  // Chat transcript, stored locally
    @State private var isGenerating: Bool = false  // True while a response is being generated
    @State private var stats: String = ""  // Token generation stats of the last response
    @State private var showAlert: Bool = false
    @State private var errorMessage: String = ""

    private let generator = GenAIGenerator()

    /// Names of the notifications this view listens for (posted by the generator).
    private enum GenerationNotification {
        static let completed = NSNotification.Name("TokenGenerationCompleted")
        static let stats = NSNotification.Name("TokenGenerationStats")
        static let failed = NSNotification.Name("TokenGenerationError")
    }

    var body: some View {
        VStack {
            // Chat bubbles
            ScrollView {
                VStack(alignment: .leading, spacing: 20) {
                    ForEach(messages) { message in
                        ChatBubble(text: message.text, isUser: message.isUser)
                            .padding(.horizontal, 20)
                    }
                    if !stats.isEmpty {
                        Text(stats)
                            .font(.footnote)
                            .foregroundColor(.gray)
                            .padding(.horizontal, 20)
                            .padding(.top, 5)
                            .multilineTextAlignment(.center)
                    }
                }
                .padding()
                .padding(.top, 20)
            }

            // User input
            HStack {
                TextField("Type your message...", text: $userInput)
                    .padding()
                    .background(Color(.systemGray6))
                    .cornerRadius(20)
                    .padding(.horizontal)

                Button(action: sendMessage) {
                    Image(systemName: "paperplane.fill")
                        .foregroundColor(.white)
                        .padding()
                        .background(isGenerating ? Color.gray : Color.pastelGreen)
                        .clipShape(Circle())
                        .padding(.trailing, 10)
                }
                .disabled(isGenerating)
            }
            .padding(.bottom, 20)
        }
        .background(Color(.systemGroupedBackground))
        .edgesIgnoringSafeArea(.bottom)
        .onReceive(NotificationCenter.default.publisher(for: GenerationNotification.completed)) { _ in
            isGenerating = false  // Re-enable the send button once generation is complete
        }
        .onReceive(SharedTokenUpdater.shared.$decodedTokens) { tokens in
            updateResponse(with: tokens)
        }
        .onReceive(NotificationCenter.default.publisher(for: GenerationNotification.stats)) { notification in
            if let userInfo = notification.userInfo,
               let promptProcRate = userInfo["promptProcRate"] as? Double,
               let tokenGenRate = userInfo["tokenGenRate"] as? Double {
                stats = String(
                    format: "Token generation rate: %.2f tokens/s. Prompt processing rate: %.2f tokens/s",
                    tokenGenRate, promptProcRate)
            }
        }
        .onReceive(NotificationCenter.default.publisher(for: GenerationNotification.failed)) { notification in
            handleGenerationError(notification)
        }
        .alert(isPresented: $showAlert) {
            Alert(
                title: Text("Error"),
                message: Text(errorMessage),
                dismissButton: .default(Text("OK"))
            )
        }
    }

    /// Appends the user's message plus an empty placeholder for the response, then starts
    /// generation off the main thread. Does nothing when the input is blank.
    private func sendMessage() {
        // Treat input made only of spaces and/or newlines as empty.
        guard !userInput.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return }

        messages.append(Message(text: userInput, isUser: true))
        messages.append(Message(text: "", isUser: false))  // Placeholder for the model response

        // Clear previously generated tokens
        SharedTokenUpdater.shared.clearTokens()

        let prompt = userInput
        userInput = ""
        isGenerating = true

        // The user is actively waiting for the answer, so use `.userInitiated`;
        // `.background` work is the lowest priority and may be heavily throttled.
        let generator = self.generator
        DispatchQueue.global(qos: .userInitiated).async {
            generator.generate(prompt)
        }
    }

    /// Writes the concatenation of `tokens` into the most recent model message, if any.
    private func updateResponse(with tokens: [String]) {
        guard let lastIndex = messages.lastIndex(where: { !$0.isUser }) else { return }
        messages[lastIndex].text = tokens.joined(separator: "")
    }

    /// Surfaces a generation failure to the user and always re-enables the send button,
    /// even when the notification carries no readable error string.
    private func handleGenerationError(_ notification: Notification) {
        errorMessage = (notification.userInfo?["error"] as? String) ?? "Unknown error"
        isGenerating = false

        // Drop the response placeholder if nothing was generated, so no empty bubble lingers.
        if let last = messages.last, !last.isUser, last.text.isEmpty {
            messages.removeLast()
        }
        showAlert = true
    }
}

struct ChatBubble: View {
var text: String
var isUser: Bool

/// Lays out one bubble: user messages are pushed to the trailing edge on a pastel green
/// background, model messages to the leading edge on a system gray background.
var body: some View {
    HStack {
        if isUser {
            Spacer()
            Text(text)
                .padding()
                .background(Color.pastelGreen)
                .foregroundColor(.white)
                .cornerRadius(25)
                .padding(.horizontal, 10)
        } else {
            Text(text)
                .padding()
                .background(Color(.systemGray5))
                // `systemGray5` changes with the system appearance, so the text color must
                // adapt too; a fixed black becomes unreadable on the dark variant.
                .foregroundColor(.primary)
                .cornerRadius(25)
                .padding(.horizontal, 10)
            Spacer()
        }
    }
}
Expand All @@ -32,3 +150,8 @@ struct ContentView_Previews: PreviewProvider {
ContentView()
}
}

// App-specific colors
extension Color {
    /// Pastel green used for the user's chat bubbles and the enabled send button.
    static let pastelGreen: Color = .init(red: 0.6, green: 0.9, blue: 0.6)
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ NS_ASSUME_NONNULL_BEGIN

/// Runs text generation for a user question and streams the result to the UI.
///
/// Create one instance and reuse it: the implementation keeps its model and tokenizer
/// between calls to -generate:.
@interface GenAIGenerator : NSObject

/// Generates a response to the given question.
///
/// The call returns only after generation has finished or failed, so invoke it off the main
/// thread. Decoded tokens are forwarded to `SharedTokenUpdater.shared` as they are produced.
/// On success, "TokenGenerationStats" and "TokenGenerationCompleted" notifications are posted
/// on the main queue; on failure, "TokenGenerationError" is posted with the message under the
/// "error" key of userInfo.
///
/// @param input_user_question The question text; it is wrapped in the chat prompt template
///        by the implementation.
- (void)generate:(NSString *)input_user_question;

@end

Expand Down
154 changes: 128 additions & 26 deletions mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm
Original file line number Diff line number Diff line change
@@ -1,49 +1,151 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#import "GenAIGenerator.h"
#include <chrono>
#include <vector>
#include "LocalLLM-Swift.h"
#include "ort_genai.h"
#include "ort_genai_c.h"


// Value passed as the "max_length" search option when generating; also used to
// pre-size the vector of per-token timings.
const size_t kMaxTokens = 200;

// Class extension holding the C++ state behind the generator. Both members are set to
// nullptr in -init and created on demand inside -generate: when they are still null.
@interface GenAIGenerator () {
// Model created from the app bundle's resource path.
std::unique_ptr<OgaModel> model;
// Tokenizer created from `model`.
std::unique_ptr<OgaTokenizer> tokenizer;
}
@end

@implementation GenAIGenerator

typedef std::chrono::steady_clock Clock;
typedef std::chrono::time_point<Clock> TimePoint;

// Creates a generator with no model or tokenizer loaded yet; both are created lazily
// by the first call to -generate:.
- (instancetype)init {
  self = [super init];
  if (self) {
    self->model = nullptr;
    self->tokenizer = nullptr;
  }
  return self;
}

// Generates a response to `input_user_question`, token by token.
//
// Each decoded token is handed to SharedTokenUpdater. When generation finishes, the
// "TokenGenerationStats" and "TokenGenerationCompleted" notifications are posted on the main
// queue. Any std::exception raised along the way is reported through "TokenGenerationError"
// instead of propagating to the caller. The method returns only once generation is done.
- (void)generate:(nonnull NSString*)input_user_question {
  std::vector<long long> tokenTimes;  // per-token generation times in milliseconds
  tokenTimes.reserve(kMaxTokens);

  try {
    NSLog(@"Starting token generation...");

    if (!self->model) {
      NSLog(@"Creating model...");
      NSString* llmPath = [[NSBundle mainBundle] resourcePath];
      // -cString is deprecated; -UTF8String returns NULL when the path is nil.
      const char* modelPath = [llmPath UTF8String];
      if (!modelPath) {
        throw std::runtime_error("Model resource path is unavailable.");
      }
      self->model = OgaModel::Create(modelPath);  // throws exception
    }

    if (!self->tokenizer) {
      NSLog(@"Creating tokenizer...");
      self->tokenizer = OgaTokenizer::Create(*self->model);  // throws exception
    }

    auto tokenizer_stream = OgaTokenizerStream::Create(*self->tokenizer);

    // Construct the prompt
    NSString* promptString =
        [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
    const char* prompt = [promptString UTF8String];

    // Encode the prompt
    auto sequences = OgaSequences::Create();
    self->tokenizer->Encode(prompt, *sequences);

    size_t promptTokensCount = sequences->SequenceCount(0);

    NSLog(@"Setting generator parameters...");
    auto params = OgaGeneratorParams::Create(*self->model);
    params->SetSearchOption("max_length", kMaxTokens);
    params->SetInputSequences(*sequences);

    auto generator = OgaGenerator::Create(*self->model, *params);

    bool isFirstToken = true;
    NSLog(@"Starting token generation loop...");

    const TimePoint startTime = Clock::now();
    // Start from startTime so the value is well defined even if no token is generated.
    TimePoint firstTokenTime = startTime;

    while (!generator->IsDone()) {
      const TimePoint tokenStartTime = Clock::now();

      generator->ComputeLogits();
      generator->GenerateNextToken();

      if (isFirstToken) {
        firstTokenTime = Clock::now();
        isFirstToken = false;
      }

      // Get the sequence data and decode the newest token
      const int32_t* seq = generator->GetSequenceData(0);
      size_t seq_len = generator->GetSequenceCount(0);
      const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);

      if (!decode_tokens) {
        throw std::runtime_error("Token decoding failed.");
      }

      // Measure token generation time excluding the UI hand-off below
      const TimePoint tokenEndTime = Clock::now();
      const long long tokenDuration = static_cast<long long>(
          std::chrono::duration_cast<std::chrono::milliseconds>(tokenEndTime - tokenStartTime).count());
      tokenTimes.push_back(tokenDuration);

      // +stringWithUTF8String: returns nil for bytes that are not valid UTF-8; skip those
      // rather than passing nil across the Swift bridge.
      NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
      if (decodedTokenString) {
        [SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
      }
    }

    const TimePoint endTime = Clock::now();
    // Log token times
    NSLog(@"Per-token generation times: %@", [self formatTokenTimes:tokenTimes]);

    // Calculate metrics
    const long long totalDuration = static_cast<long long>(
        std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count());
    const long long firstTokenDuration = static_cast<long long>(
        std::chrono::duration_cast<std::chrono::milliseconds>(firstTokenTime - startTime).count());
    const long long remainingDuration = totalDuration - firstTokenDuration;
    const size_t generatedTokensCount = tokenTimes.size();

    // Report 0 instead of inf/NaN (or an underflowed size_t) when there is not enough data
    // to compute a rate.
    double promptProcRate = 0.0;
    if (firstTokenDuration > 0) {
      promptProcRate = (double)promptTokensCount * 1000.0 / (double)firstTokenDuration;
    }
    double tokenGenRate = 0.0;
    if (generatedTokensCount > 1 && remainingDuration > 0) {
      tokenGenRate = (double)(generatedTokensCount - 1) * 1000.0 / (double)remainingDuration;
    }

    NSLog(@"Token generation completed. Total time: %lld ms, First token time: %lld ms, Total tokens: %zu",
          totalDuration, firstTokenDuration, generatedTokensCount);
    NSLog(@"Prompt tokens: %zu, Prompt Processing Rate: %f tokens/s", promptTokensCount, promptProcRate);
    NSLog(@"Generated tokens: %zu, Token Generation Rate: %f tokens/s", generatedTokensCount, tokenGenRate);

    NSDictionary* stats = @{@"tokenGenRate" : @(tokenGenRate), @"promptProcRate" : @(promptProcRate)};
    // Notify on the main thread that token generation is complete
    dispatch_async(dispatch_get_main_queue(), ^{
      [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationStats" object:nil userInfo:stats];
      [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationCompleted" object:nil];
    });

    NSLog(@"Token generation completed.");

  } catch (const std::exception& e) {
    // A nil value inside an @{} literal raises, so fall back to a fixed message when
    // what() cannot be converted.
    NSString* errorMessage = [NSString stringWithUTF8String:e.what()] ?: @"Unknown error";
    NSLog(@"Error during generation: %@", errorMessage);

    // Send error to the UI
    NSDictionary* errorInfo = @{@"error" : errorMessage};
    dispatch_async(dispatch_get_main_queue(), ^{
      [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationError" object:nil userInfo:errorInfo];
    });
  }
}

// Utility function to format token times for logging, e.g. "12 ms, 9 ms, 10 ms".
- (NSString*)formatTokenTimes:(const std::vector<long long>&)tokenTimes {
  NSMutableArray<NSString*>* formattedTimes = [NSMutableArray arrayWithCapacity:tokenTimes.size()];
  for (long long tokenTime : tokenTimes) {
    [formattedTimes addObject:[NSString stringWithFormat:@"%lld ms", tokenTime]];
  }
  // Joining avoids the dangling ", " a manual append loop leaves at the end.
  return [formattedTimes componentsJoinedByString:@", "];
}

@end
8 changes: 5 additions & 3 deletions mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ git clone https://github.com/microsoft/onnxruntime-genai

cd onnxruntime-genai

python3 build.py --parallel --build_dir ./build_iphoneos --ios --ios_sysroot iphoneos --ios_arch arm64 --ios_deployment_target 16.6 --cmake_generator Xcode
python3 build.py --parallel --build_dir ./build_iphoneos --ios --apple_sysroot iphoneos --osx_arch arm64 --apple_deploy_target 16.6 --cmake_generator Xcode

```

Expand Down Expand Up @@ -98,12 +98,14 @@ The app uses Objective-C/C++ since using Generative AI with ONNX Runtime C++ API

Download from hf repo: <https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4>

After downloading completes, you need to copy files over to the `Resources` directory in the `Destination` column of `Target-LocalLLM`->`Build Phases`-> `New Copy File Phases` -> `Copy Files`.
After downloading the files, click the `LocalLLM` project in the sidebar and go to `Targets > LocalLLM > Build Phases`. Find the Copy Files section, set the Destination to Resources, and add the downloaded files.

When the app is launched, Xcode automatically copies the model files from the Resources folder and installs them directly on the iOS device.

### 4. Run the app and checkout the streaming output token results

**Note**: The current app is set up with only a simple initial prompt question; you can adjust it, try your own, or refine the UI based on your requirements.

***Notice:*** The current Xcode project runs on iOS 16.6; feel free to adjust the iOS/build settings for later iOS versions accordingly.
***Notice:*** The current Xcode project runs on iOS 16.6; feel free to adjust the iOS/build settings for later iOS versions accordingly.

![alt text](<Simulator Screenshot - iPhone 16.png>)
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,10 @@ import Foundation
self.decodedTokens.append(token)
}
}

/// Empties `decodedTokens`.
///
/// The reset is dispatched asynchronously to the main queue, so it takes effect after
/// this call has returned.
@objc func clearTokens() {
    DispatchQueue.main.async {
        self.decodedTokens = []
    }
}
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 976cdf5

Please sign in to comment.