
Merge pull request #62 from guillermoscript/multiple-files

Add gpt-tokenizer package and implement size and token limits
Marcelo Cardoso, 1 year ago
parent commit 9d536ec40a
5 changed files with 1687 additions and 1044 deletions
  1. README.md (+5, -0)
  2. package-lock.json (+1608, -1038)
  3. package.json (+1, -0)
  4. src/config.ts (+9, -0)
  5. src/core.ts (+64, -6)

The diff is not shown because the file is too large.
+ 5 - 0
README.md


The diff is not shown because the file is too large.
+ 1608 - 1038
package-lock.json


+ 1 - 0
package.json

@@ -11,6 +11,7 @@
     "crawlee": "^3.0.0",
     "cross-env": "^7.0.3",
     "glob": "^10.3.10",
+    "gpt-tokenizer": "^2.1.2",
     "inquirer": "^9.2.12",
     "playwright": "*",
     "zod": "^3.22.4"

The diff is not shown because the file is too large.
+ 9 - 0
src/config.ts
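
The config.ts diff is not rendered above, but core.ts reads two new optional fields from the config, so the added lines plausibly extend the existing zod configSchema along these lines (a hedged sketch only; the field names match their use in core.ts, everything else is illustrative):

import { z } from "zod";

export const configSchema = z.object({
  // ...existing fields such as outputFileName...
  /** Sketch: maximum size of each combined output file, in MB (optional). */
  maxFileSize: z.number().int().positive().optional(),
  /** Sketch: maximum number of GPT tokens per combined output file (optional). */
  maxTokens: z.number().int().positive().optional(),
});

export type Config = z.infer<typeof configSchema>;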


+ 64 - 6
src/core.ts

@@ -4,6 +4,7 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
+import { isWithinTokenLimit } from "gpt-tokenizer";
 
 let pageCounter = 0;
 
@@ -142,17 +143,74 @@ export async function crawl(config: Config) {
 }
 
 export async function write(config: Config) {
-  configSchema.parse(config);
-
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
 
-  const results = [];
+  console.log(`Found ${jsonFiles.length} files to combine...`);
+
+  let currentResults: Record<string, any>[] = [];
+  let currentSize: number = 0;
+  let fileCounter: number = 1;
+  const maxBytes: number = config.maxFileSize
+    ? config.maxFileSize * 1024 * 1024
+    : Infinity;
+
+  const getStringByteSize = (str: string): number =>
+    Buffer.byteLength(str, "utf-8");
+
+  const nextFileName = (): string =>
+    `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;
+
+  const writeBatchToFile = async (): Promise<void> => {
+    await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
+    console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
+    currentResults = [];
+    currentSize = 0;
+    fileCounter++;
+  };
+
+  let estimatedTokens: number = 0;
+
+  const addContentOrSplit = async (
+    data: Record<string, any>,
+  ): Promise<void> => {
+    const contentString: string = JSON.stringify(data);
+    const tokenCount: number | false = isWithinTokenLimit(
+      contentString,
+      config.maxTokens || Infinity,
+    );
+
+    if (typeof tokenCount === "number") {
+      if (estimatedTokens + tokenCount > config.maxTokens!) {
+        // Only write the batch if it's not empty (something to write)
+        if (currentResults.length > 0) {
+          await writeBatchToFile();
+        }
+        // This item would push the batch over the token limit: write the current batch and start a new one with this item, carrying a rough (halved) estimate of its token count.
+        estimatedTokens = Math.floor(tokenCount / 2);
+        currentResults.push(data);
+      } else {
+        currentResults.push(data);
+        estimatedTokens += tokenCount;
+      }
+    }
+
+    currentSize += getStringByteSize(contentString);
+    if (currentSize > maxBytes) {
+      await writeBatchToFile();
+    }
+  };
+
+  // Iterate over each JSON file and process its contents.
   for (const file of jsonFiles) {
-    const data = JSON.parse(await readFile(file, "utf-8"));
-    results.push(data);
+    const fileContent = await readFile(file, "utf-8");
+    const data: Record<string, any> = JSON.parse(fileContent);
+    await addContentOrSplit(data);
   }
 
-  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+  // Check if any remaining data needs to be written to a file.
+  if (currentResults.length > 0) {
+    await writeBatchToFile();
+  }
 }
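
Taken together, the rewritten write() batches the crawled JSON files into numbered output files, starting a new file whenever the configured byte or token budget would be exceeded (output.json becomes output-1.json, output-2.json, and so on). A hypothetical config fragment showing just the new limits, with illustrative values; the crawler-specific fields (start URL, match pattern, etc.) are omitted:

import { Config } from "./src/config.js";

const limits: Partial<Config> = {
  outputFileName: "output.json", // batches are written as output-1.json, output-2.json, ...
  maxFileSize: 1,                // interpreted as MB, per the * 1024 * 1024 conversion above
  maxTokens: 500000,             // per-file GPT token budget, checked with gpt-tokenizer
};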