Skip to content

Commit

Permalink
chore: improve delimiter (langgenius#8552)
Browse files Browse the repository at this point in the history
  • Loading branch information
iamjoel authored Sep 19, 2024
1 parent d96f5ba commit 7411bcf
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 12 deletions.
18 changes: 18 additions & 0 deletions web/app/components/datasets/create/step-two/escape.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
function escape(input: string): string {
if (!input || typeof input !== 'string')
return ''

const res = input
.replaceAll('\\', '\\\\')
.replaceAll('\0', '\\0')
.replaceAll('\b', '\\b')
.replaceAll('\f', '\\f')
.replaceAll('\n', '\\n')
.replaceAll('\r', '\\r')
.replaceAll('\t', '\\t')
.replaceAll('\v', '\\v')
.replaceAll('\'', '\\\'')
return res
}

export default escape
37 changes: 27 additions & 10 deletions web/app/components/datasets/create/step-two/index.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
'use client'
import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useContext } from 'use-context-selector'
import { useBoolean } from 'ahooks'
Expand All @@ -13,6 +13,8 @@ import { groupBy } from 'lodash-es'
import PreviewItem, { PreviewType } from './preview-item'
import LanguageSelect from './language-select'
import s from './index.module.css'
import unescape from './unescape'
import escape from './escape'
import cn from '@/utils/classnames'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import {
Expand Down Expand Up @@ -78,6 +80,8 @@ enum IndexingType {
ECONOMICAL = 'economy',
}

const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'

const StepTwo = ({
isSetting,
documentDetail,
Expand Down Expand Up @@ -110,8 +114,11 @@ const StepTwo = ({
const previewScrollRef = useRef<HTMLDivElement>(null)
const [previewScrolled, setPreviewScrolled] = useState(false)
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
const [max, setMax] = useState(5000) // default chunk length
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
const setSegmentIdentifier = useCallback((value: string) => {
doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
}, [])
const [max, setMax] = useState(4000) // default chunk length
const [overlap, setOverlap] = useState(50)
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()
Expand Down Expand Up @@ -183,7 +190,7 @@ const StepTwo = ({
}
const resetRules = () => {
if (defaultConfig) {
setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
setSegmentIdentifier(defaultConfig.segmentation.separator)
setMax(defaultConfig.segmentation.max_tokens)
setOverlap(defaultConfig.segmentation.chunk_overlap)
setRules(defaultConfig.pre_processing_rules)
Expand Down Expand Up @@ -217,7 +224,7 @@ const StepTwo = ({
const ruleObj = {
pre_processing_rules: rules,
segmentation: {
separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
separator: unescape(segmentIdentifier),
max_tokens: max,
chunk_overlap: overlap,
},
Expand Down Expand Up @@ -394,7 +401,7 @@ const StepTwo = ({
try {
const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
const separator = res.rules.segmentation.separator
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
setSegmentIdentifier(separator)
setMax(res.rules.segmentation.max_tokens)
setOverlap(res.rules.segmentation.chunk_overlap)
setRules(res.rules.pre_processing_rules)
Expand All @@ -411,7 +418,7 @@ const StepTwo = ({
const separator = rules.segmentation.separator
const max = rules.segmentation.max_tokens
const overlap = rules.segmentation.chunk_overlap
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
setSegmentIdentifier(separator)
setMax(max)
setOverlap(overlap)
setRules(rules.pre_processing_rules)
Expand Down Expand Up @@ -616,12 +623,22 @@ const StepTwo = ({
<div className={s.typeFormBody}>
<div className={s.formRow}>
<div className='w-full'>
<div className={s.label}>{t('datasetCreation.stepTwo.separator')}</div>
<div className={s.label}>
{t('datasetCreation.stepTwo.separator')}
<Tooltip
popupContent={
<div className='max-w-[200px]'>
{t('datasetCreation.stepTwo.separatorTip')}
</div>
}
/>
</div>
<input
type="text"
className={s.input}
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
onChange={e => setSegmentIdentifier(e.target.value)}
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
value={segmentIdentifier}
onChange={e => doSetSegmentIdentifier(e.target.value)}
/>
</div>
</div>
Expand Down
54 changes: 54 additions & 0 deletions web/app/components/datasets/create/step-two/unescape.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// https://github.com/iamakulov/unescape-js/blob/master/src/index.js

/**
* \\ - matches the backslash which indicates the beginning of an escape sequence
* (
* u\{([0-9A-Fa-f]+)\} - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
* |
* u([0-9A-Fa-f]{4}) - second alternative; matches the 4-digit hexadecimal escape sequence (\uABCD)
* |
* x([0-9A-Fa-f]{2}) - third alternative; matches the 2-digit hexadecimal escape sequence (\xA5)
* |
* ([1-7][0-7]{0,2}|[0-7]{2,3}) - fourth alternative; matches the up-to-3-digit octal escape sequence (\5 or \512)
* |
* (['"tbrnfv0\\]) - fifth alternative; matches the special escape characters (\t, \n and so on)
* |
* \U([0-9A-Fa-f]+) - sixth alternative; matches the 8-digit hexadecimal escape sequence used by python (\U0001F3B5)
* )
*/
const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\\U([0-9A-Fa-f]{8})/g

const usualEscapeSequences: Record<string, string> = {
'0': '\0',
'b': '\b',
'f': '\f',
'n': '\n',
'r': '\r',
't': '\t',
'v': '\v',
'\'': '\'',
'"': '"',
'\\': '\\',
}

const fromHex = (str: string) => String.fromCodePoint(parseInt(str, 16))
const fromOct = (str: string) => String.fromCodePoint(parseInt(str, 8))

const unescape = (str: string) => {
return str.replace(jsEscapeRegex, (_, __, varHex, longHex, shortHex, octal, specialCharacter, python) => {
if (varHex !== undefined)
return fromHex(varHex)
else if (longHex !== undefined)
return fromHex(longHex)
else if (shortHex !== undefined)
return fromHex(shortHex)
else if (octal !== undefined)
return fromOct(octal)
else if (python !== undefined)
return fromHex(python)
else
return usualEscapeSequences[specialCharacter]
})
}

export default unescape
3 changes: 2 additions & 1 deletion web/i18n/en-US/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ const translation = {
custom: 'Custom',
customDescription: 'Customize chunks rules, chunks length, and preprocessing rules, etc.',
separator: 'Delimiter',
separatorPlaceholder: 'For example, newline (\\\\n) or special separator (such as "***")',
separatorTip: 'A delimiter is the character used to separate text. \\n\\n and \\n are commonly used delimiters for separating paragraphs and lines. Combined with commas (\\n\\n,\\n), paragraphs will be segmented by lines when exceeding the maximum chunk length. You can also use special delimiters defined by yourself (e.g. ***).',
separatorPlaceholder: '\\n\\n for separating paragraphs; \\n for separating lines',
maxLength: 'Maximum chunk length',
overlap: 'Chunk overlap',
overlapTip: 'Setting the chunk overlap can maintain the semantic relevance between them, enhancing the retrieve effect. It is recommended to set 10%-25% of the maximum chunk size.',
Expand Down
3 changes: 2 additions & 1 deletion web/i18n/zh-Hans/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ const translation = {
custom: '自定义',
customDescription: '自定义分段规则、分段长度以及预处理规则等参数',
separator: '分段标识符',
separatorPlaceholder: '例如换行符(\n)或特定的分隔符(如 "***")',
separatorTip: '分隔符是用于分隔文本的字符。\\n\\n 和 \\n 是常用于分隔段落和行的分隔符。用逗号连接分隔符(\\n\\n,\\n),当段落超过最大块长度时,会按行进行分割。你也可以使用自定义的特殊分隔符(例如 ***)。',
separatorPlaceholder: '\\n\\n 用于分段;\\n 用于分行',
maxLength: '分段最大长度',
overlap: '分段重叠长度',
overlapTip: '设置分段之间的重叠长度可以保留分段之间的语义关系,提升召回效果。建议设置为最大分段长度的10%-25%',
Expand Down

0 comments on commit 7411bcf

Please sign in to comment.