# BLING Sheared Llama 1.3b 0.1 for Web-LLM q4f32_1
This is a compiled version of bling-sheared-llama-1.3b-0.1 for MLC Web-LLM, using q4f32_1 quantization (4-bit-quantized weights with float32 compute).
## Usage
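The example below assumes the [@mlc-ai/web-llm](https://www.npmjs.com/package/@mlc-ai/web-llm) package is installed (e.g. via `npm install @mlc-ai/web-llm`) and that the script runs in a WebGPU-capable browser.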
```js
import * as webLLM from "@mlc-ai/web-llm";

const modelId = "bling-sheared-llama-1.3b-0.1-q4f32_1";

const appConfig = {
  model_list: [
    {
      // URL of the quantized model weights.
      model_url:
        "https://huggingface.co/Felladrin/mlc-chat-bling-sheared-llama-1.3b-0.1-q4f32_1/resolve/main/params/",
      local_id: modelId,
    },
  ],
  model_lib_map: {
    // WebGPU-compiled model library matching the weights above.
    [modelId]:
      "https://huggingface.co/Felladrin/mlc-chat-bling-sheared-llama-1.3b-0.1-q4f32_1/resolve/main/bling-sheared-llama-1.3b-0.1-q4f32_1-webgpu.wasm",
  },
};

const chatConfig = {
  temperature: 0.7,
  repetition_penalty: 1.2,
  top_p: 0.95,
};

async function main() {
  const chat = new webLLM.ChatModule();

  // Download (or load from cache) the model weights and library.
  await chat.reload(modelId, chatConfig, appConfig);

  let lastResponse = "";

  // Called on every generation step with the partial message so far.
  const generateProgressCallback = (_step, message = "") => {
    // Stop generating if the engine reports an empty partial message.
    if (message.length === 0) return chat.interruptGenerate();
    lastResponse = message;
    console.log(`Partial response: ${lastResponse}`);
  };

  const firstPrompt = "Hi! I have a question.";
  await chat.generate(firstPrompt, generateProgressCallback);
  console.log(`Complete response: ${lastResponse}`);

  const secondPrompt = "Who is Alan Turing?";
  await chat.generate(secondPrompt, generateProgressCallback);
  console.log(`Complete response: ${lastResponse}`);

  // Print prefill/decode speed statistics.
  console.info(await chat.runtimeStatsText());
}

main();
```
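Web-LLM runs entirely in the browser on WebGPU, so the model will fail to load where WebGPU is unavailable. As a minimal sketch using standard WebGPU feature detection (the error message is illustrative), you could replace the unconditional `main()` call above with a guard:

```js
// Feature-detect WebGPU: `navigator.gpu` is only defined in
// WebGPU-capable browsers (e.g. recent Chromium releases).
if (navigator.gpu) {
  main();
} else {
  console.error("WebGPU is not available in this browser.");
}
```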