/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #ifndef LLAVA_NO_TORCH_DUMMY_IMAGE #include #else #include // std::fill #endif #if defined(ET_USE_THREADPOOL) #include #include #endif DEFINE_string( model_path, "llava.pte", "Model serialized in flatbuffer format."); DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); DEFINE_string( image_path, "", "The path to a .pt file, a serialized torch tensor for an image, longest edge resized to 336."); DEFINE_double( temperature, 0.8f, "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); DEFINE_int32( seq_len, 1024, "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens."); DEFINE_int32( cpu_threads, -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); using executorch::extension::llm::Image; int32_t main(int32_t argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); // Create a loader to get the data of the program file. There are other // DataLoaders that use mmap() or point32_t to data that's already in memory, // and users can create their own DataLoaders to load from arbitrary sources. const char* model_path = FLAGS_model_path.c_str(); const char* tokenizer_path = FLAGS_tokenizer_path.c_str(); const char* prompt = FLAGS_prompt.c_str(); std::string image_path = FLAGS_image_path; double temperature = FLAGS_temperature; int32_t seq_len = FLAGS_seq_len; int32_t cpu_threads = FLAGS_cpu_threads; #if defined(ET_USE_THREADPOOL) uint32_t num_performant_cores = cpu_threads == -1 ? ::executorch::extension::cpuinfo::get_num_performant_cores() : static_cast(cpu_threads); ET_LOG( Info, "Resetting threadpool with num threads = %d", num_performant_cores); if (num_performant_cores > 0) { ::executorch::extension::threadpool::get_threadpool() ->_unsafe_reset_threadpool(num_performant_cores); } #endif // create llama runner example::LlavaRunner runner(model_path, tokenizer_path, temperature); // read image and resize the longest edge to 336 std::vector image_data; #ifdef LLAVA_NO_TORCH_DUMMY_IMAGE // Work without torch using a random data image_data.resize(3 * 240 * 336); std::fill(image_data.begin(), image_data.end(), 0); // black std::array image_shape = {3, 240, 336}; std::vector images = { {.data = image_data, .width = image_shape[2], .height = image_shape[1]}}; #else // LLAVA_NO_TORCH_DUMMY_IMAGE // cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR); // int longest_edge = std::max(image.rows, image.cols); // float scale_factor = 336.0f / longest_edge; // cv::Size new_size(image.cols * scale_factor, image.rows * scale_factor); // cv::Mat resized_image; // cv::resize(image, resized_image, new_size); // image_data.assign(resized_image.datastart, resized_image.dataend); torch::Tensor image_tensor; torch::load(image_tensor, image_path); // CHW ET_LOG( Info, "image size(0): %" PRId64 ", size(1): %" PRId64 ", size(2): %" PRId64, image_tensor.size(0), image_tensor.size(1), image_tensor.size(2)); image_data.assign( image_tensor.data_ptr(), image_tensor.data_ptr() + image_tensor.numel()); std::vector images = { {.data = image_data, .width = static_cast(image_tensor.size(2)), .height = static_cast(image_tensor.size(1))}}; #endif // LLAVA_NO_TORCH_DUMMY_IMAGE // generate runner.generate(std::move(images), prompt, seq_len); return 0; }