Skip to content

Commit

Permalink
feat: union pattern base demo
Browse files Browse the repository at this point in the history
  • Loading branch information
jojocys committed Dec 8, 2022
1 parent c8dfe35 commit 6c54308
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 46 deletions.
1 change: 1 addition & 0 deletions packages/rath-client/public/locales/en-US.json
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@
"cancel": "Cancel",
"auto": "Extend Suggestions",
"suggestion": {
"regex_selection": "Pattern discovery & extraction | Extract patterns from text fields and generate new fields matching extracted pattern.",
"dateTimeExpand": "Group date time by units of time.|Time series may be more regular or correlated as the time attribution is grouped into different scales such as years and months.",
"oneHot": "One-hot encoding for categorical fields.|One-hot encoding is a way to represent categorical variables as binary vectors. It allows the representation of categorical data to be more expressive.",
"wordTFIDF": "Extract words from text fields(score: tf-idf).|TF-IDF is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.",
Expand Down
1 change: 1 addition & 0 deletions packages/rath-client/public/locales/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@
"cancel": "取消",
"auto": "扩展建议",
"suggestion": {
"regex_selection": "模式发现&提取|将选中的字段作为模式,发现数据集中所有匹配的相似模式并提取出来。",
"dateTimeExpand": "将数据映射到不同时间单位。|时间序列可能在不同的单位尺度下展现出更强的周期性或相关性。",
"oneHot": "独热编码(one-hot encoding)|独热编码可以将类别型数据转换为多个数值型数据,方便进行相关性分析。",
"wordTFIDF": "按文档的词频与逆向文件频率提取类别(tf-idf)。|tf-idf是一种统计方法,用以评估一字词对于一个文件集或一个语料库中的其中一份文件的重要程度",
Expand Down
15 changes: 8 additions & 7 deletions packages/rath-client/src/lib/textPattern/init.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,25 @@ import regexgen from 'regexgen';
const patterns = [
{
name: 'text',
pattern: /(?:\w+|[\u4e00-\u9fa5]+)(?:\s+|[\u4e00-\u9fa5]+)*/g
pattern: /(?!\d+$)(?:\w+|[\u4e00-\u9fa5]+)(?:\s+|[\u4e00-\u9fa5]+)*/
},
{
name: 'number',
pattern: /(?:\d+)(?:\.\d+)?/g
pattern: /(?:\d+)(?:\.\d+)?/
},
{
name: 'punctuation',
pattern: /[\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E\u00A0-\u00BF\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+/g
pattern: /[\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E\u00A0-\u00BF\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+/
},
{
name: 'symbol',
pattern: /[\u0021-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E\u00A0-\u00BF\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+/g
pattern: /[\u0021-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E\u00A0-\u00BF\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+/
}
];
// @ts-ignore
// console.log('window buffer', window.Buffer, regexgen)
export function initPatterns (textSelection: {str: string; startIndex: number; endIndex: number}[]) {
console.log(textSelection)
// console.log(textSelection)
const patternTypes = new Set<string>();
const rawPH: string[] = [];
const rawPE: string[] = [];
Expand All @@ -40,6 +40,7 @@ export function initPatterns (textSelection: {str: string; startIndex: number; e
}
// rawPH.push(text.str.slice(text.endIndex))
for (let pattern of patterns) {
// console.log(pattern.name, selection)
if (pattern.pattern.test(selection)) {
patternTypes.add(pattern.name);
}
Expand All @@ -64,11 +65,11 @@ export function initPatterns (textSelection: {str: string; startIndex: number; e
}
}
} else {
const concatPattern = new RegExp(`${ph.source}(?<selection>.+?)${pe.source}`);
const concatPattern = new RegExp(`${ph.source}(?<selection>\\S+?)${pe.source}`);
return {
ph,
pe,
selection: /.+/,
selection: /\S+/,
pattern: concatPattern
}
}
Expand Down
122 changes: 85 additions & 37 deletions packages/rath-client/src/pages/dataSource/dataTable/index.tsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import React, { useCallback, useEffect, useState} from "react";
import React, { useCallback, useEffect, useMemo, useState} from "react";
import { ArtColumn, BaseTable, Classes } from "ali-react-table";
import styled from "styled-components";
import { observer } from 'mobx-react-lite'
Expand Down Expand Up @@ -30,13 +30,27 @@ const DataTable: React.FC = (props) => {
const { dataSourceStore } = useGlobalStore();
const { filteredDataMetaInfo, fieldsWithExtSug: fields, filteredDataStorage } = dataSourceStore;
const [filteredData, setFilteredData] = useState<IRow[]>([]);
const [textPattern, setTextPattern] = useState<{
const [textSelectList, setTextSelectList] = useState<{
fid: string;
str: string;
startIndex: number;
endIndex: number}[]>([]);
const textPattern = useMemo<{
fid: string;
ph: RegExp;
pe: RegExp;
selection: RegExp;
pattern: RegExp;
} | undefined>();
} | undefined>(() => {
if (textSelectList.length === 0) return;
const res = initPatterns(textSelectList);
if (res) {
return {
fid: textSelectList[0].fid,
...res
};
}
}, [textSelectList])
useEffect(() => {
if (filteredDataMetaInfo.versionCode === -1) {
setFilteredData([]);
Expand Down Expand Up @@ -100,31 +114,67 @@ const DataTable: React.FC = (props) => {
}
}
}
const onTextSelect = (fid: string, fullText: string) => {
const onTextSelect = (fid: string, fullText: string, td: Node) => {
// console.log('onTextSelect', fid, fullText, td)
const sl = document.getSelection();
// const fullText = sl?.focusNode?.nodeValue;
const selectedText = sl?.toString();
const range = sl?.getRangeAt(0);
if (!range)return;
const selectedText = range.toString();

// Create a range representing the selected text
const selectedRange = range.cloneRange();

// Create a range representing the full text of the element
const fullRange = document.createRange();
fullRange.selectNodeContents(td);

// Compare the selected range to the full range
const startPos = selectedRange.startOffset;//fullRange.compareBoundaryPoints(Range.START_TO_START, selectedRange);
const endPos = selectedRange.endOffset;//fullRange.compareBoundaryPoints(Range.END_TO_END, selectedRange);
if (fullText && selectedText) {
// console.log({
// fullText,
// selectedText,
// sl
// })
const startIndex = fullText.indexOf(selectedText);
const endIndex = startIndex + selectedText.length;
const res = initPatterns([{
const startIndex = startPos//fullText.indexOf(selectedText);
const endIndex = endPos//startIndex + selectedText.length;
setTextSelectList(l => l.concat({
fid,
str: fullText,
startIndex: startIndex,
endIndex: endIndex,
}])
if (res) {
setTextPattern({
fid,
...res
});
}
}));
}
}
const clearTextSelect = () => {
// setTextPattern(undefined);
setTextSelectList([]);
}
useEffect(() => {
if (textPattern?.fid) {
dataSourceStore.addExtSuggestions({
score: 10.1,
type: 'regex_selection',
apply: (fid) => dataSourceStore.expandFromRegex(fid, textPattern.pattern)
}, textPattern.fid);
}

}, [dataSourceStore, textPattern])

useEffect(() => {
// clear text pattern when ESC is pressed
const handleKeyDown = (e: KeyboardEvent) => {
if (e.key === 'Escape') {
clearTextSelect();
}
}
document.addEventListener('keydown', handleKeyDown);
return () => {
document.removeEventListener('keydown', handleKeyDown);
}

}, [])

const columns: ArtColumn[] = displayList.map((f, i) => {
const fm = (fields[i] && fields[i].fid === displayList[i].fid) ? fields[i] : fields.find(m => m.fid === f.fid);
Expand All @@ -147,43 +197,41 @@ const DataTable: React.FC = (props) => {
/>
),
};
if (textPattern && textPattern.fid === f.fid) {
col.render = (value: any) => {
const { ph, pe, selection, pattern } = textPattern;
const text: string = value?.toString() ?? '';
const match = pattern.exec(value)
col.render = (value: any) => {
const text: string = `${value}`;
if (textPattern && textPattern.fid === f.fid) {
const { pattern } = textPattern;
const patternForIndices = new RegExp(pattern.source, pattern.flags + 'd');
const match = patternForIndices.exec(text)

// console.log({ match, text, value, pattern, ph, pe, selection })
if (match) {
// @ts-ignore
const matched = match.groups['selection'];
if (!matched) return;
const start = text.indexOf(matched);
const end = start + matched.length;
const matchedRange = match.indices.groups['selection'];
if (!matchedRange) return;
const start = matchedRange[0];
const end = matchedRange[1]
const before = text.slice(0, start);
const after = text.slice(end);

return (
<span onMouseUp={() => {
onTextSelect(f.fid, `${value}`)
<span onMouseUp={(e) => {
onTextSelect(f.fid, `${text}`, e.currentTarget)
}}>
{before}
<span style={{ backgroundColor: '#FFC107' }}>
{matched}
{text.slice(start, end)}
</span>
{after}
</span>
);
}
return text;
}
} else {
col.render = (value: any) => {
return <span onMouseUp={() => {
onTextSelect(f.fid, `${value}`)
}}>
{value}
</span>
}
return <span onMouseUp={(e) => {
onTextSelect(f.fid, `${text}`, e.currentTarget)
}}>
{text}
</span>
}
return col;
})
Expand Down
59 changes: 59 additions & 0 deletions packages/rath-client/src/store/dataSourceStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,28 @@ export class DataSourceStore {
return fieldWithExtSuggestions;
}

/**
 * Registers (or replaces) an extension suggestion for the field identified by `fid`.
 *
 * - Does nothing when no entry in `fieldMetas` has the given `fid`.
 * - When `fieldsWithExtSug` has no entry for the field, a new one is created from
 *   the field meta with an empty suggestion list and pushed onto `fieldsWithExtSug`.
 * - Any existing suggestion with the same `type` is removed first, so each type
 *   appears at most once per field.
 * - The suggestion list is kept sorted by `score`, highest first.
 *
 * NOTE(review): if `fieldsWithExtSug` is a derived/computed value rather than
 * stored state, the mutations made here may not persist — confirm against its definition.
 *
 * @param suggestion the suggestion to attach to the field
 * @param fid id of the field receiving the suggestion
 */
public addExtSuggestions (suggestion: FieldExtSuggestion, fid: string) {
    const meta = this.fieldMetas.find(m => m.fid === fid);
    if (!meta) return;

    let entry = this.fieldsWithExtSug.find(item => item.fid === fid);
    if (!entry) {
        entry = { ...meta, extSuggestions: [] };
        this.fieldsWithExtSug.push(entry);
    }

    const suggestions = entry.extSuggestions;
    // Drop a previous suggestion of the same type so the new one replaces it.
    const existingIndex = suggestions.findIndex(item => item.type === suggestion.type);
    if (existingIndex !== -1) {
        suggestions.splice(existingIndex, 1);
    }
    suggestions.push(suggestion);
    // Highest score first.
    suggestions.sort((lhs, rhs) => rhs.score - lhs.score);
}


public canExpandAsDateTime(fid: string) {
const which = this.mutFields.find(f => f.fid === fid);
const expanded = Boolean(this.mutFields.find(
Expand Down Expand Up @@ -762,6 +784,43 @@ export class DataSourceStore {
})));
}
}
/**
 * Derives a new preview field from `fid` by matching every row's value against `pattern`.
 *
 * Each row value is stringified with a template literal before matching. When the
 * pattern defines a named capture group `selection` (the text-selection pattern
 * builder wraps the target as `<head>(?<selection>…)<tail>`), only that group is
 * stored, so the surrounding context used to locate the selection is not included
 * in the extracted value. Patterns without such a group (or where the group did not
 * participate in the match) fall back to the whole match.
 *
 * Rows that do not match are passed through unchanged, i.e. without the new key.
 *
 * @param fid id of the source field; nothing happens if it is not in `allFields`
 * @param pattern regular expression applied to each stringified value
 */
public async expandFromRegex (fid: string, pattern: RegExp) {
    const originField = this.allFields.find(f => f.fid === fid);
    if (!originField) {
        return;
    }
    const data = await this.rawDataStorage.getAll();
    const values: string[] = data.map(d => `${d[fid]}`);
    const newField: IRawField = {
        fid: `${fid}_regex_selection`,
        name: `${originField.name}.selection`,
        semanticType: 'nominal',
        analyticType: 'dimension',
        extInfo: {
            extFrom: [fid],
            extOpt: 'LaTiao.$regex',
            extInfo: {
                pattern: pattern.toString(),
            }
        },
        geoRole: 'none'
    }
    const newData = data.map((d, index) => {
        const match = values[index].match(pattern);
        if (!match) {
            return d;
        }
        // Prefer the named `selection` group: match[0] also contains the head/tail
        // context of the pattern, which is not part of what the user selected.
        const extracted = match.groups?.selection ?? match[0];
        return {
            ...d,
            [newField.fid]: extracted,
        }
    });
    this.addExtFieldsFromRows(newData, [newField].map(f => ({
        ...f,
        stage: 'preview',
    })));
}

public async expandWordTF (fid: string) {
const data = await this.rawDataStorage.getAll();
const values: string[] = data.map(d => `${d[fid]}`);
Expand Down
4 changes: 2 additions & 2 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3451,7 +3451,7 @@
resolved "https://registry.yarnpkg.com/@types/range-parser/-/range-parser-1.2.4.tgz#cd667bcfdd025213aafb7ca5915a932590acdcdc"
integrity sha512-EEhsLsD6UsDM1yFhAvy0Cjr6VwmpMWqFBCb9w07wVugF7w9nfajxLuVmngTIpgS6svCnm6Vaw+MZhoDCKnOfsw==

"@types/react-dom@^17.0.1":
"@types/react-dom@^17.0.1", "@types/react-dom@^17.x":
version "17.0.18"
resolved "https://registry.npmmirror.com/@types/react-dom/-/react-dom-17.0.18.tgz#8f7af38f5d9b42f79162eea7492e5a1caff70dc2"
integrity sha512-rLVtIfbwyur2iFKykP2w0pl/1unw26b5td16d5xMgp7/yjTHomkyxPYChFoCr/FtEX1lN9wY6lFj1qvKdS5kDw==
Expand Down Expand Up @@ -3482,7 +3482,7 @@
dependencies:
"@types/react" "*"

"@types/react@*", "@types/react@^17", "@types/react@^17.0.2":
"@types/react@*", "@types/react@^17", "@types/react@^17.0.2", "@types/react@^17.x":
version "17.0.52"
resolved "https://registry.npmmirror.com/@types/react/-/react-17.0.52.tgz#10d8b907b5c563ac014a541f289ae8eaa9bf2e9b"
integrity sha512-vwk8QqVODi0VaZZpDXQCmEmiOuyjEFPY7Ttaw5vjM112LOq37yz1CDJGrRJwA1fYEq4Iitd5rnjd1yWAc/bT+A==
Expand Down

0 comments on commit 6c54308

Please sign in to comment.