Skip to content

Commit

Permalink
V4.6.6-1 (labring#656)
Browse files Browse the repository at this point in the history
  • Loading branch information
c121914yu authored Dec 27, 2023
1 parent 86286ef commit 759a233
Show file tree
Hide file tree
Showing 182 changed files with 3,084 additions and 81,670 deletions.
22 changes: 12 additions & 10 deletions docSite/content/docs/development/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,17 @@ weight: 708

**使用时,请务必去除注释!**

以下配置适用于V4.6.6-alpha版本以后

```json
{
"SystemParams": {
"systemEnv": {
"pluginBaseUrl": "", // 商业版接口地址
"vectorMaxProcess": 15, // 向量生成最大进程,结合数据库性能和 key 来设置
"qaMaxProcess": 15, // QA 生成最大进程,结合数据库性能和 key 来设置
"pgHNSWEfSearch": 100 // pg vector 索引参数,越大精度高但速度慢
},
"ChatModels": [ // 对话模型
"chatModels": [ // 对话模型
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
Expand Down Expand Up @@ -76,7 +78,7 @@ weight: 708
"defaultSystemChatPrompt": ""
}
],
"QAModels": [ // QA 生成模型
"qaModels": [ // QA 生成模型
{
"model": "gpt-3.5-turbo-16k",
"name": "GPT35-16k",
Expand All @@ -85,7 +87,7 @@ weight: 708
"price": 0
}
],
"CQModels": [ // 问题分类模型
"cqModels": [ // 问题分类模型
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
Expand All @@ -105,7 +107,7 @@ weight: 708
"functionPrompt": ""
}
],
"ExtractModels": [ // 内容提取模型
"extractModels": [ // 内容提取模型
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
Expand All @@ -116,7 +118,7 @@ weight: 708
"functionPrompt": ""
}
],
"QGModels": [ // 生成下一步指引
"qgModels": [ // 生成下一步指引
{
"model": "gpt-3.5-turbo-1106",
"name": "GPT35-1106",
Expand All @@ -125,7 +127,7 @@ weight: 708
"price": 0
}
],
"VectorModels": [ // 向量模型
"vectorModels": [ // 向量模型
{
"model": "text-embedding-ada-002",
"name": "Embedding-2",
Expand All @@ -134,8 +136,8 @@ weight: 708
"maxToken": 3000
}
],
"ReRankModels": [], // 重排模型,暂时填空数组
"AudioSpeechModels": [
"reRankModels": [], // 重排模型,暂时填空数组
"audioSpeechModels": [
{
"model": "tts-1",
"name": "OpenAI TTS1",
Expand All @@ -152,7 +154,7 @@ weight: 708
]
}
],
"WhisperModel": {
"whisperModel": {
"model": "whisper-1",
"name": "Whisper1",
"price": 0
Expand Down
2 changes: 1 addition & 1 deletion docSite/content/docs/development/upgrading/465.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ weight: 831

## 配置文件变更

由于 openai 已开始启用 function call,改为 toolChoice。FastGPT 同步的修改了对于的配置和调用方式,需要对配置文件做一些修改:
由于 openai 已开始弃用 function call,改为 toolChoice。FastGPT 同步修改了对应的配置和调用方式,需要对配置文件做一些修改:

[点击查看最新的配置文件](/docs/development/configuration/)

Expand Down
22 changes: 22 additions & 0 deletions docSite/content/docs/development/upgrading/466.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
title: 'V4.6.6(需要改配置文件)'
description: 'FastGPT V4.6.6'
icon: 'upgrade'
draft: false
toc: true
weight: 831
---

**版本仍在开发中……**

## 配置文件变更

为了减少代码重复度,我们对配置文件做了一些修改:[点击查看最新的配置文件](/docs/development/configuration/)



## V4.6.6 即将更新

1. UI 优化,未来将逐步替换为新的 UI 设计。


10 changes: 6 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,18 @@
"prepare": "husky install",
"format-code": "prettier --config \"./.prettierrc.js\" --write \"./**/src/**/*.{ts,tsx,scss}\"",
"format-doc": "zhlint --dir ./docSite *.md --fix",
"gen:theme-typings": "chakra-cli tokens projects/app/src/web/styles/theme.ts --out node_modules/.pnpm/node_modules/@chakra-ui/styled-system/dist/theming.types.d.ts",
"postinstall": "sh ./scripts/postinstall.sh"
},
"devDependencies": {
"@chakra-ui/cli": "^2.4.1",
"husky": "^8.0.3",
"lint-staged": "^13.2.1",
"prettier": "^3.0.3",
"zhlint": "^0.7.1",
"i18next": "^22.5.1",
"lint-staged": "^13.2.1",
"next-i18next": "^13.3.0",
"react-i18next": "^12.3.1"
"prettier": "^3.0.3",
"react-i18next": "^12.3.1",
"zhlint": "^0.7.1"
},
"lint-staged": {
"./**/**/*.{ts,tsx,scss}": "npm run format-code",
Expand Down
62 changes: 62 additions & 0 deletions packages/global/common/file/read/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';

/**
 * Extract plain text from a PDF using pdfjs-dist.
 *
 * @param pdf - anything `pdfjsLib.getDocument` accepts: a URL string, a URL
 *              object, or the raw file bytes as an ArrayBuffer.
 * @returns the concatenated text of all pages, with a newline appended after
 *          tokens that look like paragraph ends.
 */
export const readPdfFile = async ({ pdf }: { pdf: string | URL | ArrayBuffer }) => {
  // pdf.js requires a worker script; this path is served by the app
  // (presumably from its public assets — TODO confirm against deployment).
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

  // Shape of one text item returned by page.getTextContent().
  type TokenType = {
    str: string; // the text run itself
    dir: string; // writing direction
    width: number;
    height: number;
    transform: number[]; // transform[5] is the token's y position on the page
    fontName: string;
    hasEOL: boolean; // true when the token ends a visual line
  };

  // Read a single page and join its text tokens, skipping header/footer bands.
  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.07; // assume the page header sits in the top 7% of the page
    const footerThreshold = pageHeight * 0.93; // assume the page footer sits in the bottom 7% of the page

    // Keep tokens positioned between the header and footer bands. Tokens with
    // no transform cannot be positioned, so they are kept as well.
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] > headerThreshold && token.transform[5] < footerThreshold)
      );
    });

    // concat empty string 'hasEOL': fold an empty token's line-break flag into
    // the previous token, then drop the empty token (i-- compensates for splice).
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    // Release page resources held by pdf.js.
    page.cleanup();

    return pageTexts
      .map((token) => {
        // A token that both ends a line and ends with sentence punctuation is
        // treated as a paragraph boundary: emit a newline after it.
        const paragraphEnd = token.hasEOL && /([.?!\n\r]|(\r\n))$/.test(token.str);

        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;
  // Start all page reads, then await them together; Promise.all preserves
  // input order, so pages are concatenated in document order.
  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);

  return pageTexts.join('');
};
38 changes: 38 additions & 0 deletions packages/global/common/string/markdown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,41 @@ export const simpleMarkdownText = (rawText: string) => {

return rawText.trim();
};

/**
 * Format markdown text:
 * 1. Upload every inline base64 image via `uploadImgController` and replace
 *    the data URI with the returned URL; failed uploads are stripped along
 *    with the now-empty image tag.
 * 2. Trim trailing whitespace after image tags.
 * 3. Run the shared `simpleMarkdownText` cleanup on the result.
 *
 * @param rawText - markdown that may contain `![alt](data:image/...;base64,...)` images
 * @param uploadImgController - uploads one base64 data URI, resolves to its hosted URL
 * @returns the cleaned markdown text
 */
export const uploadMarkdownBase64 = async ({
  rawText,
  uploadImgController
}: {
  rawText: string;
  uploadImgController: (base64: string) => Promise<string>;
}) => {
  // Match inline base64 image data URIs. ')' is excluded so the match stops
  // at the closing parenthesis of the markdown image syntax.
  const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
  const base64Arr = rawText.match(base64Regex) || [];

  // Upload each base64 image and swap it for the hosted URL.
  await Promise.all(
    base64Arr.map(async (base64Img) => {
      try {
        const str = await uploadImgController(base64Img);

        rawText = rawText.replace(base64Img, str);
      } catch (error) {
        // Upload failed: drop the data URI, then remove the empty image tag.
        rawText = rawText.replace(base64Img, '');
        rawText = rawText.replace(/!\[.*\]\(\)/g, '');
      }
    })
  );

  // Remove whitespace after image tags. Replace unconditionally: guarding it
  // with `test()` on a /g regex is stateful (lastIndex advances between calls)
  // and adds nothing — replace is a no-op when nothing matches.
  rawText = rawText.replace(/(!\[.*\]\(.*\))\s*/g, '$1');

  return simpleMarkdownText(rawText);
};
20 changes: 13 additions & 7 deletions packages/global/common/string/textSplitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ export const splitText2Chunks = (props: {

// The larger maxLen is, the next sentence is less likely to trigger splitting
const stepReges: { reg: RegExp; maxLen: number }[] = [
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
...customReg.map((text) => ({ reg: new RegExp(`(${text})`, 'g'), maxLen: chunkLen * 1.4 })),
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
Expand Down Expand Up @@ -64,13 +64,22 @@ export const splitText2Chunks = (props: {
}
];
}

const isCustomSteep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const independentChunk = checkIndependentChunk(step);

const { reg } = stepReges[step];

const splitTexts = text
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
.replace(
reg,
(() => {
if (isCustomSteep) return splitMarker;
if (independentChunk) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
)
.split(`${splitMarker}`)
.filter((part) => part.trim());

Expand Down Expand Up @@ -128,11 +137,6 @@ export const splitText2Chunks = (props: {
const independentChunk = checkIndependentChunk(step);
const isCustomStep = checkIsCustomStep(step);

// mini text
if (text.length <= chunkLen) {
return [text];
}

// oversize
if (step >= stepReges.length) {
if (text.length < chunkLen * 3) {
Expand Down Expand Up @@ -221,6 +225,8 @@ export const splitText2Chunks = (props: {
} else {
chunks.push(`${mdTitle}${lastText}`);
}
} else if (lastText && chunks.length === 0) {
chunks.push(lastText);
}

return chunks;
Expand Down
29 changes: 27 additions & 2 deletions packages/global/common/system/types/index.d.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,29 @@
export type FeConfigsType = {
import type {
  ChatModelItemType,
  FunctionModelItemType,
  LLMModelItemType,
  VectorModelItemType,
  AudioSpeechModelType, // was `AudioSpeechModels` — the type used below is `AudioSpeechModelType`
  WhisperModelType,
  ReRankModelItemType
} from '../../../core/ai/model.d';

/* fastgpt main config file shape (the parsed config.json) */
export type FastGPTConfigFileType = {
  feConfigs: FastGPTFeConfigsType;
  systemEnv: SystemEnvType;
  chatModels: ChatModelItemType[];
  qaModels: LLMModelItemType[];
  cqModels: FunctionModelItemType[];
  extractModels: FunctionModelItemType[];
  qgModels: LLMModelItemType[];
  vectorModels: VectorModelItemType[];
  reRankModels: ReRankModelItemType[];
  audioSpeechModels: AudioSpeechModelType[];
  whisperModel: WhisperModelType;
};

export type FastGPTFeConfigsType = {
show_emptyChat?: boolean;
show_register?: boolean;
show_appStore?: boolean;
Expand Down Expand Up @@ -34,6 +59,6 @@ export type SystemEnvType = {
};

declare global {
var feConfigs: FeConfigsType;
var feConfigs: FastGPTFeConfigsType;
var systemEnv: SystemEnvType;
}
1 change: 1 addition & 0 deletions packages/global/core/ai/model.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export type VectorModelItemType = {
defaultToken: number;
price: number;
maxToken: number;
weight: number;
};

export type ReRankModelItemType = {
Expand Down
3 changes: 2 additions & 1 deletion packages/global/core/ai/model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export const defaultVectorModels: VectorModelItemType[] = [
name: 'Embedding-2',
price: 0,
defaultToken: 500,
maxToken: 3000
maxToken: 3000,
weight: 100
}
];
1 change: 1 addition & 0 deletions packages/global/core/dataset/type.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ export type DatasetTrainingSchemaType = {
q: string;
a: string;
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
};

Expand Down
5 changes: 3 additions & 2 deletions packages/global/core/module/template/system/contextExtract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,11 @@ export const ContextExtractModule: FlowModuleTemplateType = {
type: FlowNodeInputTypeEnum.textarea,
valueType: ModuleIOValueTypeEnum.string,
label: '提取要求描述',
description: '给AI一些对应的背景知识或要求描述,引导AI更好的完成任务',
description:
'给AI一些对应的背景知识或要求描述,引导AI更好的完成任务。\n该输入框可使用全局变量。',
required: true,
placeholder:
'例如: \n1. 你是一个实验室预约助手,你的任务是帮助用户预约实验室。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
'例如: \n1. 当前时间为: {{cTime}}。你是一个实验室预约助手,你的任务是帮助用户预约实验室,从文本中获取对应的预约信息。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
showTargetInApp: true,
showTargetInPlugin: true
},
Expand Down
5 changes: 3 additions & 2 deletions packages/global/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"axios": "^1.5.1",
"dayjs": "^1.11.7",
"openai": "4.23.0",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"axios": "^1.5.1",
"openai": "4.23.0",
"pdfjs-dist": "^4.0.269",
"timezones-list": "^3.0.2"
},
"devDependencies": {
Expand Down
Loading

0 comments on commit 759a233

Please sign in to comment.