/* 1. Import transformers.js
You can import transformers.js directly from the CDN
(replace <version> with the release you want to pin):
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@<version>';
The dynamic import below is used instead so that the
import works within the interactive shell.
You can edit the code block to replace
the local import with the CDN import, for example:
const module = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@<version>');
To download the wasm modules from the CDN as well, comment out the line
module.env.backends.onnx.wasm.wasmPaths = '/webapp/js/'
*/
/**
 * Loads the locally hosted transformers.js bundle, builds a quantized
 * feature-extraction pipeline and publishes it as `window.appState.extractor`.
 *
 * Failures are reported with `console.error` and are not re-thrown, so the
 * returned promise always resolves to `undefined`.
 *
 * @returns {Promise<void>}
 */
async function loadTrasformers() {
  try {
    const transformers = await import('/webapp/js/transformers.min.js');
    const { env, pipeline: createPipeline } = transformers;

    // Point the ONNX backend at the wasm files under /webapp/js/.
    env.backends.onnx.wasm.wasmPaths = '/webapp/js/';

    const featureExtractor = await createPipeline(
      'feature-extraction',
      'Xenova/jina-embeddings-v2-small-en',
      { quantized: true },
    );

    // Hack: codapi code cells share state through this global object.
    window.appState = { extractor: featureExtractor };
    console.log('🔥 Transformers loaded!');
  } catch (loadError) {
    console.error('Error loading the module:', loadError);
  }
}

await loadTrasformers();
/* 2. Create vector embeddings. */

// Synthetic document: four passages labelled A-D, one per line.
const doc = `
A: "Climate change is leading to more unpredictable weather patterns, including prolonged droughts and unexpected frosts. These changes are forcing farmers to alter traditional planting schedules. Crops like wheat and maize are particularly vulnerable to these shifts, potentially leading to decreased yields and increased food prices."
B: "The impact of climate change on agriculture extends beyond just temperature changes. Increased CO2 levels can stimulate plant growth, but this is often offset by other limiting factors like water availability and nutrient levels. This complexity makes it difficult for farmers to predict crop behavior and plan accordingly."
C: "One of the indirect effects of climate change on agriculture is the increase in pest populations. Warmer temperatures can lead to more frequent and severe pest infestations, affecting crop health and yield. Farmers are increasingly turning to integrated pest management practices to combat this issue."
D: "Advancements in agricultural technology, such as drought-resistant crops and precision farming, are helping farmers adapt to the challenges posed by climate change. These innovations are crucial for ensuring food security in the face of changing environmental conditions."
`;

// Naive passage splitting: every non-blank line becomes one chunk.
const isNonBlank = (line) => line.trim().length > 0;
const chunks = doc.split("\n").filter(isNonBlank);

// Starts the embedding of a single chunk (mean-pooled, normalized) and
// returns whatever the extractor returns for it.
const embedChunk = (text, position) => {
  console.log(`🔥 processing chunk ${position + 1}/${chunks.length}`);
  return window.appState.extractor(text, { pooling: "mean", normalize: true });
};

// All chunks are submitted up front; results keep the order of `chunks`.
const extractorResult = await Promise.all(chunks.map(embedChunk));

// Publish the embeddings and their source text on the global appState object
// so that later cells can read them.
Object.assign(window.appState, { extractorResult, chunks });
/* 3. Load Voy search module. */

/**
 * Imports the Voy JavaScript glue module, instantiates its WebAssembly
 * binary against it, and stores the glue module on `window.appState.Voy`.
 *
 * Failures are reported with `console.error` and are not re-thrown, so the
 * returned promise always resolves to `undefined`.
 *
 * @returns {Promise<void>}
 */
async function loadVoy() {
  try {
    const voyGlue = await import('/webapp/js/voy_search_bg.js');

    // The wasm binary imports its host functions from the glue module,
    // keyed by the module specifier below.
    const wasmResponse = fetch("/webapp/js/voy_search_bg.wasm");
    const importObject = { "./voy_search_bg.js": voyGlue };
    const { instance } = await WebAssembly.instantiateStreaming(wasmResponse, importObject);

    // Hand the instantiated exports back to the glue code.
    voyGlue.__wbg_set_wasm(instance.exports);
    console.log('🔥 Voy loaded!');

    window.appState.Voy = voyGlue;
  } catch (loadError) {
    console.error('Error loading Voy:', loadError);
  }
}

await loadVoy();
The index can be saved with the index.serialize() method and loaded again in the next run.
We are not using local storage here, as we are only demonstrating the use of Voy in the browser.
/* 4. Create a voy search index. */

// Builds the record for one embedding result: the position serves as the id,
// the matching chunk text as the title, and the embedding data is copied
// into a plain array.
const toVoyRecord = (embedding, position) => ({
  id: String(position),
  title: window.appState.chunks[position],
  url: `/path/${position}`,
  embeddings: Array.from(embedding.data),
});

const records = window.appState.extractorResult.map(toVoyRecord);
const resource = { embeddings: records };

// The index class is the `Voy` export of the module stored on appState.
const { Voy: VoyIndex } = window.appState.Voy;
const index = new VoyIndex(resource);

// Share the index with later cells through the global appState object.
window.appState.index = index;

const indexSize = window.appState.index.size();
console.log(`🔥 Voy index created! Index size => ${indexSize}`);
Consider the query: How is climate change impacting agricultural practices?
To me, A and C appear to be the relevant answers to the question. Do you agree? Does the cosine similarity ranking agree?
/* 5. Semantic search without HYDE. */

// The plain user question, embedded with the same pooling and normalization
// options that were used for the document chunks.
const query = 'How is climate change impacting agricultural practices?';
const queryEmbedding = await window.appState.extractor(query, { pooling: "mean", normalize: true });

// Ask the index for up to 5 nearest neighbours and print their titles
// in the order the index returned them.
const results = window.appState.index.search(queryEmbedding.data, 5);
for (const neighbor of results.neighbors) {
  console.log(`✨ ${neighbor.title}`);
}
Get an answer to the query from ChatGPT. When asking ChatGPT, don't mention our documents, so that the answer is based only on GPT's training data. Let's repeat the semantic search, this time adding GPT's answer to the query.
/* 6. Semantic search with HYDE. */

// The bindings below use a `hyde` prefix so they do not redeclare the
// `query`, `queryEmbedding` and `results` constants of the search without
// HYDE; redeclaring a `const` in the same scope is a SyntaxError.

// HyDE query: the original question followed by a hypothetical answer.
const hydeQuery = `How is climate change impacting agricultural practices?
Climate change is affecting agricultural practices primarily through alterations in weather patterns, which impact crop growth and harvest timings.`;

// Embed the combined text with the same options used for the document chunks.
const hydeQueryEmbedding = await window.appState.extractor(
  hydeQuery, {
    pooling: "mean",
    normalize: true,
  });

// Ask the index for up to 5 nearest neighbours and print their titles
// in the order the index returned them.
const hydeResults = window.appState.index.search(hydeQueryEmbedding.data, 5);
hydeResults.neighbors.forEach((neighbor) => {
  console.log(`✨ ${neighbor.title}`);
});
Do you see a difference in ranking order?
When building QA agents for knowledge bases, I use the HyDE strategy for both the document indexing and the retrieval stages. During indexing, I generate QA pairs by asking a long-context LLM to generate frequently asked questions from the given doc, and then index each of the FAQs as a QA pair. During retrieval, I use a query with a hypothetical answer, as in the example above.