@@ -4,6 +4,7 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
+import { isWithinTokenLimit } from "gpt-tokenizer";
 
 let pageCounter = 0;
 
@@ -142,17 +143,86 @@ export async function crawl(config: Config) {
 }
 
 export async function write(config: Config) {
-  configSchema.parse(config);
-
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
 
-  const results = [];
+  console.log(`Found ${jsonFiles.length} files to combine...`);
+
+  let currentResults: Record<string, any>[] = [];
+  let currentSize: number = 0;
+  let fileCounter: number = 1;
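+  // config.maxFileSize is a size in megabytes; when it is unset, the
+  // byte-based split below is effectively disabled.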
+  const maxBytes: number = config.maxFileSize
+    ? config.maxFileSize * 1024 * 1024
+    : Infinity;
+
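+  // Measures the UTF-8 encoded byte length, which can exceed str.length
+  // for non-ASCII content.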
+  const getStringByteSize = (str: string): number =>
+    Buffer.byteLength(str, "utf-8");
+
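+  // e.g. "output.json" -> "output-1.json", "output-2.json", ...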
+  const nextFileName = (): string =>
+    `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;
+
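+  // Flush the current batch to its own numbered file and reset the batch state.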
+  const writeBatchToFile = async (): Promise<void> => {
+    await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
+    console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
+    currentResults = [];
+    currentSize = 0;
+    fileCounter++;
+  };
+
+  let estimatedTokens: number = 0;
+
+  const addContentOrSplit = async (
+    data: Record<string, any>,
+  ): Promise<void> => {
+    const contentString: string = JSON.stringify(data);
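+    // isWithinTokenLimit returns the token count when the string fits
+    // within the limit, and false when it does not.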
+    const tokenCount: number | false = isWithinTokenLimit(
+      contentString,
+      config.maxTokens || Infinity,
+    );
+
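+    // A false tokenCount means this single item already exceeds maxTokens,
+    // so it is skipped rather than batched.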
+    if (typeof tokenCount === "number") {
+      if (estimatedTokens + tokenCount > config.maxTokens!) {
+        // Only flush the batch if there is something to write.
+        if (currentResults.length > 0) {
+          await writeBatchToFile();
+        }
+        // Adding this item would exceed maxTokens, so start a new batch
+        // with it; the estimate resets to half its token count (a heuristic).
+        estimatedTokens = Math.floor(tokenCount / 2);
+        currentResults.push(data);
+      } else {
+        currentResults.push(data);
+        estimatedTokens += tokenCount;
+      }
+    }
+
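+    // The byte-size limit is tracked independently of the token limit.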
+    currentSize += getStringByteSize(contentString);
+    if (currentSize > maxBytes) {
+      await writeBatchToFile();
+    }
+  };
+
+  // Iterate over each JSON file and process its contents.
   for (const file of jsonFiles) {
-    const data = JSON.parse(await readFile(file, "utf-8"));
-    results.push(data);
+    const fileContent = await readFile(file, "utf-8");
+    const data: Record<string, any> = JSON.parse(fileContent);
+    await addContentOrSplit(data);
   }
 
-  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+  // Check if any remaining data needs to be written to a file.
+  if (currentResults.length > 0) {
+    await writeBatchToFile();
+  }
 }