/**
 * Characters that must be written as a backslash escape sequence so that the
 * value can be shown (and edited) as visible text, mapped to that sequence.
 */
const ESCAPE_SEQUENCES: Record<string, string> = {
  '\\': '\\\\',
  '\0': '\\0',
  '\b': '\\b',
  '\f': '\\f',
  '\n': '\\n',
  '\r': '\\r',
  '\t': '\\t',
  '\v': '\\v',
  '\'': '\\\'',
}

// Matches every key of ESCAPE_SEQUENCES (backslash, NUL, backspace, \f, \n, \r, \t, \v, single quote).
const ESCAPABLE_CHARS = /[\\\x00\x08\f\n\r\t\v']/g

const CHAR_CODE_0 = 48
const CHAR_CODE_9 = 57

/**
 * Converts control characters, backslashes and single quotes in `input` into
 * their visible backslash escape sequences (e.g. a real newline becomes the
 * two characters `\n`).
 *
 * The replacement is done in a single pass, so backslashes produced by one
 * substitution are never escaped a second time.
 *
 * @param input - The raw string to escape.
 * @returns The escaped string, or `''` when `input` is empty or not a string.
 */
function escape(input: string): string {
  if (!input || typeof input !== 'string')
    return ''

  return input.replace(ESCAPABLE_CHARS, (char: string, offset: number) => {
    if (char === '\0') {
      // `\0` directly followed by a digit would be read back as an octal
      // escape (e.g. `\01` -> U+0001), so use the unambiguous hex form instead.
      const next = input.charCodeAt(offset + 1)
      if (next >= CHAR_CODE_0 && next <= CHAR_CODE_9)
        return '\\x00'
    }
    return ESCAPE_SEQUENCES[char] ?? char
  })
}

export default escape
documentDetail, @@ -110,8 +114,11 @@ const StepTwo = ({ const previewScrollRef = useRef(null) const [previewScrolled, setPreviewScrolled] = useState(false) const [segmentationType, setSegmentationType] = useState(SegmentType.AUTO) - const [segmentIdentifier, setSegmentIdentifier] = useState('\\n') - const [max, setMax] = useState(5000) // default chunk length + const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) + const setSegmentIdentifier = useCallback((value: string) => { + doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER) + }, []) + const [max, setMax] = useState(4000) // default chunk length const [overlap, setOverlap] = useState(50) const [rules, setRules] = useState([]) const [defaultConfig, setDefaultConfig] = useState() @@ -183,7 +190,7 @@ const StepTwo = ({ } const resetRules = () => { if (defaultConfig) { - setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n') + setSegmentIdentifier(defaultConfig.segmentation.separator) setMax(defaultConfig.segmentation.max_tokens) setOverlap(defaultConfig.segmentation.chunk_overlap) setRules(defaultConfig.pre_processing_rules) @@ -217,7 +224,7 @@ const StepTwo = ({ const ruleObj = { pre_processing_rules: rules, segmentation: { - separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier, + separator: unescape(segmentIdentifier), max_tokens: max, chunk_overlap: overlap, }, @@ -394,7 +401,7 @@ const StepTwo = ({ try { const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' }) const separator = res.rules.segmentation.separator - setSegmentIdentifier((separator === '\n' ? 
'\\n' : separator) || '\\n') + setSegmentIdentifier(separator) setMax(res.rules.segmentation.max_tokens) setOverlap(res.rules.segmentation.chunk_overlap) setRules(res.rules.pre_processing_rules) @@ -411,7 +418,7 @@ const StepTwo = ({ const separator = rules.segmentation.separator const max = rules.segmentation.max_tokens const overlap = rules.segmentation.chunk_overlap - setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n') + setSegmentIdentifier(separator) setMax(max) setOverlap(overlap) setRules(rules.pre_processing_rules) @@ -616,12 +623,22 @@ const StepTwo = ({
-
{t('datasetCreation.stepTwo.separator')}
+
+ {t('datasetCreation.stepTwo.separator')} + + {t('datasetCreation.stepTwo.separatorTip')} +
+ } + /> +
setSegmentIdentifier(e.target.value)} + placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} + value={segmentIdentifier} + onChange={e => doSetSegmentIdentifier(e.target.value)} />
// https://github.com/iamakulov/unescape-js/blob/master/src/index.js

/**
 * \\ - matches the backslash which indicates the beginning of an escape sequence
 * (
 *   u\{([0-9A-Fa-f]+)\} - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
 * |
 *   u([0-9A-Fa-f]{4}) - second alternative; matches the 4-digit hexadecimal escape sequence (\uABCD)
 * |
 *   x([0-9A-Fa-f]{2}) - third alternative; matches the 2-digit hexadecimal escape sequence (\xA5)
 * |
 *   ([1-7][0-7]{0,2}|[0-7]{2,3}) - fourth alternative; matches the up-to-3-digit octal escape sequence (\5 or \512)
 * |
 *   (['"tbrnfv0\\]) - fifth alternative; matches the special escape characters (\t, \n and so on)
 * )
 * |
 * \\U([0-9A-Fa-f]{8}) - sixth alternative (outside the group above); matches the
 *   exactly-8-digit hexadecimal escape sequence used by python (\U0001F3B5)
 */
const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\\U([0-9A-Fa-f]{8})/g

/** Single-character escapes (the character after the backslash) and the character each one stands for. */
const usualEscapeSequences: Record<string, string> = {
  '0': '\0',
  'b': '\b',
  'f': '\f',
  'n': '\n',
  'r': '\r',
  't': '\t',
  'v': '\v',
  '\'': '\'',
  '"': '"',
  '\\': '\\',
}

// Largest value accepted by String.fromCodePoint; anything above throws a RangeError.
const MAX_CODE_POINT = 0x10FFFF

/**
 * Parses `digits` in the given radix and returns the matching character, or
 * `fallback` when the value is not a valid Unicode code point.
 */
const fromDigits = (digits: string, radix: number, fallback: string): string => {
  const codePoint = Number.parseInt(digits, radix)
  // NaN and Infinity both fail this comparison and fall through to `fallback`.
  return codePoint >= 0 && codePoint <= MAX_CODE_POINT
    ? String.fromCodePoint(codePoint)
    : fallback
}

const fromHex = (str: string, fallback = '') => fromDigits(str, 16, fallback)
const fromOct = (str: string, fallback = '') => fromDigits(str, 8, fallback)

/**
 * Replaces backslash escape sequences written out as text (`\n`, `\t`,
 * `\x41`, `\u0041`, `\u{1F3B5}`, `\101`, python-style `\U0001F3B5`, ...) with
 * the characters they denote.
 *
 * Text that is not an escape sequence is returned unchanged. An escape whose
 * numeric value is not a valid Unicode code point (e.g. `\u{110000}`) is left
 * as typed instead of throwing.
 *
 * @param str - Text possibly containing escape sequences.
 * @returns The text with every recognised escape sequence decoded.
 */
const unescape = (str: string): string => {
  return str.replace(
    jsEscapeRegex,
    (
      match: string,
      _group: string | undefined,
      varHex: string | undefined,
      longHex: string | undefined,
      shortHex: string | undefined,
      octal: string | undefined,
      specialCharacter: string | undefined,
      python: string | undefined,
    ) => {
      if (varHex !== undefined)
        return fromHex(varHex, match)
      if (longHex !== undefined)
        return fromHex(longHex, match)
      if (shortHex !== undefined)
        return fromHex(shortHex, match)
      if (octal !== undefined)
        return fromOct(octal, match)
      if (python !== undefined)
        return fromHex(python, match)
      if (specialCharacter !== undefined)
        return usualEscapeSequences[specialCharacter] ?? match
      return match
    },
  )
}

export default unescape