Perf improvements for VSCODE schema gen (#455)
- Ability to handle a larger number of schema definitions
- Added option to generate only embeddings
- Fixed bugs encountered while processing workbench.actions
pcdeadeasy authored Dec 4, 2024
1 parent 76e78ca commit 6d40b1a
Showing 4 changed files with 258 additions and 53 deletions.
20 changes: 20 additions & 0 deletions ts/.vscode/launch.json
@@ -284,6 +284,26 @@
                 "${workspaceFolder}/**/*.js"
             ]
         },
+        {
+            "name": "VSCODE SchemaGen -genembeddings",
+            "type": "node",
+            "request": "launch",
+            "skipFiles": [
+                "<node_internals>/**"
+            ],
+            "cwd": "${workspaceFolder}/examples/vscodeSchemaGen",
+            "program": "./dist/main.js",
+            "args": [ "-genembeddings",
+                "-schemaFile",
+                "",
+                "-actionPrefix",
+                "workbench.action"
+            ],
+            "console": "externalTerminal",
+            "outFiles": [
+                "${workspaceFolder}/**/*.js"
+            ]
+        },
         {
             "name": "VSCODE SchemaGen -statgen",
             "type": "node",
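The new launch entry runs ./dist/main.js with -genembeddings, -schemaFile, and -actionPrefix workbench.action, exercising the embeddings-only mode mentioned in the commit message. main.ts is not part of this diff, so the sketch below only illustrates how such flags might be read from process.argv; the parseFlag and run helpers and the console dispatch are assumptions for illustration, not the example's actual entry point.

// Hypothetical sketch: flag handling inferred from the launch configuration above.
function parseFlag(argv: string[], flag: string): string | undefined {
    const i = argv.indexOf(flag);
    // Flags such as -schemaFile and -actionPrefix take the next token as their value.
    return i >= 0 && i + 1 < argv.length ? argv[i + 1] : undefined;
}

async function run(argv: string[]): Promise<void> {
    const schemaFile = parseFlag(argv, "-schemaFile");
    const actionPrefix = parseFlag(argv, "-actionPrefix");

    if (argv.includes("-genembeddings")) {
        // Embeddings-only mode: reuse an existing schema file instead of
        // regenerating schemas first. Placeholder dispatch; the real entry
        // point presumably forwards these values to genEmbeddingDataFromActionSchema.
        console.log(
            `genembeddings: schemaFile=${schemaFile}, actionPrefix=${actionPrefix}`,
        );
    }
}

run(process.argv.slice(2)).catch((err) => console.error(err));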
72 changes: 49 additions & 23 deletions ts/examples/vscodeSchemaGen/src/genStats.ts
@@ -19,27 +19,53 @@ export interface StatsResult {
     stdDevScore: number;
 }

-function loadActionData(filePath: string): any[] {
-    const rawData = fs.readFileSync(filePath, "utf8").split("\n");
-    let data: any[] = [];
-    rawData.forEach((line, index) => {
-        if (line.trim() !== "") {
-            try {
-                data.push(JSON.parse(line));
-            } catch (error: any) {
-                console.error(`Error parsing JSON on line ${index + 1}:`, line);
-                console.error(`Error details: ${error.message}`);
-            }
-        }
-    });
-    return data.map((item: any) => ({
-        ...item,
-        embedding: new Float32Array(item.embedding),
-        requests: item.requests.map((request: any) => ({
-            ...request,
-            embedding: new Float32Array(request.embedding),
-        })),
-    }));
-}
+export async function loadActionData(filePath: string): Promise<any[]> {
+    const readStream = fs.createReadStream(filePath, { encoding: "utf8" });
+    const results: any[] = [];
+    let leftover = "";
+
+    for await (const chunk of readStream) {
+        const lines = (leftover + chunk).split("\n");
+        leftover = lines.pop()!;
+
+        for (const line of lines) {
+            if (line.trim()) {
+                try {
+                    const item = JSON.parse(line);
+                    results.push({
+                        ...item,
+                        embedding: new Float32Array(item.embedding),
+                        requests: item.requests.map((request: any) => ({
+                            ...request,
+                            embedding: new Float32Array(request.embedding),
+                        })),
+                    });
+                } catch (error: any) {
+                    console.error(`Error parsing line: ${line}`);
+                    console.error(`Error details: ${error.message}`);
+                }
+            }
+        }
+    }
+
+    if (leftover.trim()) {
+        try {
+            const item = JSON.parse(leftover);
+            results.push({
+                ...item,
+                embedding: new Float32Array(item.embedding),
+                requests: item.requests.map((request: any) => ({
+                    ...request,
+                    embedding: new Float32Array(request.embedding),
+                })),
+            });
+        } catch (error: any) {
+            console.error(`Error parsing leftover: ${leftover}`);
+            console.error(`Error details: ${error.message}`);
+        }
+    }
+
+    return results;
+}

 function calcMean(values: number[]): number {
@@ -253,13 +279,13 @@ export function loadCommentsActionSchema(
     return actionSchemaComments;
 }

-export function processActionSchemaAndReqData(
+export async function processActionSchemaAndReqData(
     actionreqEmbeddingsFile: string,
     threshold: number = 0.7,
     statsfile: string,
     zerorankStatsFile: string | undefined,
 ) {
-    const data: any[] = loadActionData(actionreqEmbeddingsFile);
+    const data: any[] = await loadActionData(actionreqEmbeddingsFile);
     const results: any[] = generateStats(data, threshold);
     printDetailedMarkdownTable(
         results,
@@ -268,14 +294,14 @@ export function processActionSchemaAndReqData(
     );
 }

-export function processActionReqDataWithComments(
+export async function processActionReqDataWithComments(
     schemaFilePath: string,
     actionreqEmbeddingsFile: string,
     threshold: number = 0.7,
     statsfile: string,
     zerorankStatsFile: string | undefined,
 ) {
-    const data: any[] = loadActionData(actionreqEmbeddingsFile);
+    const data: any[] = await loadActionData(actionreqEmbeddingsFile);
     const results: any[] = generateStats(data, threshold);
     const actionSchemaComments = loadCommentsActionSchema(schemaFilePath);
     printDetailedMarkdownTable(
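The rewritten loadActionData streams the JSONL file in chunks and stitches partial lines across chunk boundaries via leftover, so large embedding files no longer have to be loaded as a single string; callers now await it. As a point of comparison only (this is not the code in the commit), the same line-by-line parse can be expressed with node:readline, which handles chunk boundaries internally:

// Alternative sketch, not part of this commit: equivalent JSONL loading via node:readline.
import * as fs from "fs";
import * as readline from "readline";

export async function loadActionDataViaReadline(filePath: string): Promise<any[]> {
    const rl = readline.createInterface({
        input: fs.createReadStream(filePath, { encoding: "utf8" }),
        crlfDelay: Infinity, // treat \r\n as a single line break
    });

    const results: any[] = [];
    for await (const line of rl) {
        if (!line.trim()) continue;
        try {
            // Same shape as the committed parser: revive embeddings as Float32Array.
            const item = JSON.parse(line);
            results.push({
                ...item,
                embedding: new Float32Array(item.embedding),
                requests: item.requests.map((request: any) => ({
                    ...request,
                    embedding: new Float32Array(request.embedding),
                })),
            });
        } catch (error: any) {
            console.error(`Error parsing line: ${line}`);
            console.error(`Error details: ${error.message}`);
        }
    }
    return results;
}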
175 changes: 155 additions & 20 deletions ts/examples/vscodeSchemaGen/src/schemaGen.ts
@@ -4,6 +4,7 @@
 import * as path from "path";
 import dotenv from "dotenv";
 import * as fs from "fs";
+import { finished } from "stream/promises";

 import {
     ChatModel,
@@ -12,7 +13,7 @@ import {
     openai,
 } from "aiclient";
 import { generateActionRequests } from "./actionGen.js";
-import { dedupeList, generateEmbedding, TypeSchema } from "typeagent";
+import { dedupeList, generateEmbeddingWithRetry, TypeSchema } from "typeagent";

 const envPath = new URL("../../../.env", import.meta.url);
 dotenv.config({ path: envPath });
@@ -32,12 +33,30 @@ async function getModelCompletionResponse(
     }
 }

+async function writeSchemaEmbeddingsDataToFile(
+    schemaData: any[],
+    outputPath: string,
+): Promise<void> {
+    const writeStream = fs.createWriteStream(outputPath);
+    return new Promise((resolve, reject) => {
+        schemaData.forEach((item) => {
+            try {
+                writeStream.write(JSON.stringify(item) + "\n");
+            } catch (error) {
+                reject(error);
+            }
+        });
+        writeStream.end();
+        writeStream.on("finish", resolve);
+        writeStream.on("error", reject);
+    });
+}
+
 export async function createVSCODESchemaGen(
     model: ChatModelWithStreaming,
     jsonSchema: any,
 ) {
     console.log("Generating VSCODE schema...");

     model.complete("Generate VSCODE schema").then((response: any) => {
         if (response.choices) {
             const schema = response.choices[0].text;
@@ -139,12 +158,124 @@ export async function generateEmbeddingForActionsRequests(
     const userRequestEmbeddings = await Promise.all(
         actionRequests.map(async (request: string) => ({
             request,
-            embedding: Array.from(await generateEmbedding(model, request)),
+            embedding: Array.from(
+                await generateEmbeddingWithRetry(model, request),
+            ),
         })),
     );
     return userRequestEmbeddings;
 }
+
+export async function genEmbeddingDataFromActionSchema(
+    model: ChatModel,
+    jsonFilePath: string,
+    schemaFilePath: string,
+    actionPrefix: string | undefined,
+    output_dir: string,
+    maxNodestoProcess: number = -1,
+) {
+    if (fs.existsSync(schemaFilePath)) {
+        const schema = fs.readFileSync(schemaFilePath, "utf8");
+        const schemaLines = schema.split("\n\n");
+        const schemaDefinitions: string[] = [];
+        for (const line of schemaLines) {
+            schemaDefinitions.push(line);
+        }
+
+        const embeddingModel = openai.createEmbeddingModel();
+        let aggrData: any = [];
+        let processedNodeCount = 0;
+
+        for (const schemaStr of schemaDefinitions) {
+            let actionSchemaData: any = parseTypeComponents(schemaStr);
+            const actionString: string = `${actionSchemaData.typeName} ${actionSchemaData.actionName} ${actionSchemaData.comments.join(" ")}`;
+            let actionEmbedding: Float32Array =
+                await generateEmbeddingWithRetry(
+                    embeddingModel,
+                    JSON.stringify(actionString),
+                );
+
+            let typeSchema: TypeSchema = {
+                typeName: actionSchemaData.typeName,
+                schemaText: schemaStr,
+            };
+
+            let actionRequests: string[] = await generateActionRequests(
+                "variations",
+                model,
+                typeSchema,
+                actionSchemaData.comments.join(" "),
+                25,
+            );
+
+            actionRequests = dedupeList(actionRequests);
+            actionRequests.sort();
+
+            let actionReqEmbeddings = await generateEmbeddingForActionsRequests(
+                embeddingModel,
+                actionRequests,
+            );
+
+            aggrData.push({
+                ...actionSchemaData,
+                schema: schemaStr,
+                embedding: Array.from(actionEmbedding),
+                requests: actionReqEmbeddings,
+            });
+
+            processedNodeCount++;
+
+            if (processedNodeCount % 50 === 0) {
+                console.log(
+                    `Processed ${processedNodeCount} schema definitions so far.`,
+                );
+            }
+        }
+
+        if (aggrData.length > 0) {
+            const jsonlFileName =
+                actionPrefix !== undefined && actionPrefix.length > 0
+                    ? path.join(
+                          output_dir,
+                          "aggr_data_[" + actionPrefix + "].jsonl",
+                      )
+                    : path.join(output_dir, "aggr_data.jsonl");
+            writeSchemaEmbeddingsDataToFile(aggrData, jsonlFileName);
+            console.log(
+                `Aggregate action and request data file: ${jsonlFileName}`,
+            );
+        }
+        console.log(
+            `Total action schema definitions processed: ${processedNodeCount}`,
+        );
+    } else {
+        console.log(`Schema file not found: ${schemaFilePath}`);
+    }
+}
+
+async function persistSchemaDefinitions(
+    schemaFilePath: string,
+    schemaDefinitions: string[],
+    processedNodeCount: number,
+    schemaCount: number,
+): Promise<void> {
+    const writeStream = fs.createWriteStream(schemaFilePath, {
+        encoding: "utf8",
+    });
+    for (const definition of schemaDefinitions) {
+        if (!writeStream.write(`${definition}\n\n`)) {
+            await new Promise((resolve) => writeStream.once("drain", resolve));
+        }
+    }
+    writeStream.end();
+    await finished(writeStream);
+
+    console.log(`Schema definitions file: ${schemaFilePath}`);
+    console.log(
+        `Total nodes processed: ${processedNodeCount}, Total schemas generated: ${schemaCount}`,
+    );
+}

 export async function processVscodeCommandsJsonFile(
     model: ChatModel,
     jsonFilePath: string,
@@ -193,10 +324,11 @@ export async function processVscodeCommandsJsonFile(

         let actionSchemaData: any = parseTypeComponents(schemaStr);
         const actionString: string = `${actionSchemaData.typeName} ${actionSchemaData.actionName} ${actionSchemaData.comments.join(" ")}`;
-        let actionEmbedding: Float32Array = await generateEmbedding(
-            embeddingModel,
-            JSON.stringify(actionString),
-        );
+        let actionEmbedding: Float32Array =
+            await generateEmbeddingWithRetry(
+                embeddingModel,
+                JSON.stringify(actionString),
+            );

         let typeSchema: TypeSchema = {
             typeName: actionSchemaData.typeName,
@@ -248,19 +380,22 @@
         }
     }

-    fs.writeFileSync(schemaFilePath, schemaDefinitions.join("\n\n"));
-    console.log(`Schema definitions file: ${schemaFilePath}`);
-    console.log(
-        `Total nodes processed: ${processedNodeCount}, Total schemas generated: ${schemaCount}`,
+    persistSchemaDefinitions(
+        schemaFilePath,
+        schemaDefinitions,
+        processedNodeCount,
+        schemaCount,
     );

-    const jsonlData = aggrData
-        .map((item: any) => JSON.stringify(item))
-        .join("\n");
-    const jsonlFileName = path.join(
-        output_dir,
-        "aggr_data_[" + actionPrefix + "].jsonl",
-    );
-    fs.writeFileSync(jsonlFileName, jsonlData);
-    console.log(`Aggregate action and request data file: ${jsonlFileName}`);
+    if (aggrData.length > 0) {
+        const jsonlFileName =
+            actionPrefix !== undefined && actionPrefix.length > 0
+                ? path.join(
+                      output_dir,
+                      "aggr_data_[" + actionPrefix + "].jsonl",
+                  )
+                : path.join(output_dir, "aggr_data.jsonl");
+        writeSchemaEmbeddingsDataToFile(aggrData, jsonlFileName);
+        console.log(`Aggregate action and request data file: ${jsonlFileName}`);
+    }
 }
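In schemaGen.ts the single fs.writeFileSync calls are replaced with write streams: writeSchemaEmbeddingsDataToFile resolves on the stream's "finish" event, while persistSchemaDefinitions also checks the boolean returned by write() and waits for "drain" before continuing, then awaits finished() so the file is fully flushed. A generic JSONL writer combining both ideas might look like the sketch below; the helper name is made up and this illustrates the pattern rather than reproducing code from the commit.

// Pattern sketch (assumption, not committed code): stream JSONL output with backpressure.
import * as fs from "fs";
import { finished } from "stream/promises";

async function writeJsonlWithBackpressure(
    items: any[],
    outputPath: string,
): Promise<void> {
    const writeStream = fs.createWriteStream(outputPath, { encoding: "utf8" });
    for (const item of items) {
        // write() returns false when the internal buffer is full; wait for "drain".
        if (!writeStream.write(JSON.stringify(item) + "\n")) {
            await new Promise((resolve) => writeStream.once("drain", resolve));
        }
    }
    writeStream.end();
    await finished(writeStream); // ensure everything is flushed to disk
}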