|
@@ -4,9 +4,7 @@ import { readFile, writeFile } from "fs/promises";
|
|
|
import { glob } from "glob";
|
|
|
import { Config, configSchema } from "./config.js";
|
|
|
import { Page } from "playwright";
|
|
|
-import {
|
|
|
- isWithinTokenLimit,
|
|
|
-} from 'gpt-tokenizer'
|
|
|
+import { isWithinTokenLimit } from "gpt-tokenizer";
|
|
|
|
|
|
let pageCounter = 0;
|
|
|
|
|
@@ -144,20 +142,26 @@ export async function crawl(config: Config) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-export async function write(config: Config) {
|
|
|
- const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });
|
|
|
+export async function write(config: Config) {
|
|
|
+ const jsonFiles = await glob("storage/datasets/default/*.json", {
|
|
|
+ absolute: true,
|
|
|
+ });
|
|
|
|
|
|
console.log(`Found ${jsonFiles.length} files to combine...`);
|
|
|
|
|
|
let currentResults: Record<string, any>[] = [];
|
|
|
let currentSize: number = 0;
|
|
|
let fileCounter: number = 1;
|
|
|
- const maxBytes: number = config.maxFileSize ? config.maxFileSize * 1024 * 1024 : Infinity;
|
|
|
-
|
|
|
- const getStringByteSize = (str: string): number => Buffer.byteLength(str, 'utf-8');
|
|
|
-
|
|
|
- const nextFileName = (): string => `${config.outputFileName.replace(/\.json$/, '')}-${fileCounter}.json`;
|
|
|
-
|
|
|
+ const maxBytes: number = config.maxFileSize
|
|
|
+ ? config.maxFileSize * 1024 * 1024
|
|
|
+ : Infinity;
|
|
|
+
|
|
|
+ const getStringByteSize = (str: string): number =>
|
|
|
+ Buffer.byteLength(str, "utf-8");
|
|
|
+
|
|
|
+ const nextFileName = (): string =>
|
|
|
+ `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;
|
|
|
+
|
|
|
const writeBatchToFile = async (): Promise<void> => {
|
|
|
await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
|
|
|
console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
|
|
@@ -165,14 +169,19 @@ export async function write(config: Config) {
|
|
|
currentSize = 0;
|
|
|
fileCounter++;
|
|
|
};
|
|
|
-
|
|
|
+
|
|
|
let estimatedTokens: number = 0;
|
|
|
|
|
|
- const addContentOrSplit = async (data: Record<string, any>): Promise<void> => {
|
|
|
+ const addContentOrSplit = async (
|
|
|
+ data: Record<string, any>,
|
|
|
+ ): Promise<void> => {
|
|
|
const contentString: string = JSON.stringify(data);
|
|
|
- const tokenCount: number | false = isWithinTokenLimit(contentString, config.maxTokens || Infinity);
|
|
|
+ const tokenCount: number | false = isWithinTokenLimit(
|
|
|
+ contentString,
|
|
|
+ config.maxTokens || Infinity,
|
|
|
+ );
|
|
|
|
|
|
- if (typeof tokenCount === 'number') {
|
|
|
+ if (typeof tokenCount === "number") {
|
|
|
if (estimatedTokens + tokenCount > config.maxTokens!) {
|
|
|
// Only write the batch if it's not empty (something to write)
|
|
|
if (currentResults.length > 0) {
|
|
@@ -195,7 +204,7 @@ export async function write(config: Config) {
|
|
|
|
|
|
// Iterate over each JSON file and process its contents.
|
|
|
for (const file of jsonFiles) {
|
|
|
- const fileContent = await readFile(file, 'utf-8');
|
|
|
+ const fileContent = await readFile(file, "utf-8");
|
|
|
const data: Record<string, any> = JSON.parse(fileContent);
|
|
|
await addContentOrSplit(data);
|
|
|
}
|
|
@@ -204,4 +213,4 @@ export async function write(config: Config) {
|
|
|
if (currentResults.length > 0) {
|
|
|
await writeBatchToFile();
|
|
|
}
|
|
|
-};
|
|
|
+}
|