Skip to content

Commit

Permalink
Merge pull request #1043 from PyThaiNLP/llm-tool
Browse files Browse the repository at this point in the history
Add pythainlp.llm
  • Loading branch information
wannaphong authored Dec 27, 2024
2 parents 73e38be + 4d465d2 commit 26f1982
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 0 deletions.
9 changes: 9 additions & 0 deletions docs/api/llm.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
.. currentmodule:: pythainlp.llm

pythainlp.llm
=============

Modules
-------

.. autofunction:: remove_repeated_ngrams
8 changes: 8 additions & 0 deletions pythainlp/llm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

# Public API of this package: the names listed here are re-exported
# from the ``text_util`` submodule by the import below.
__all__ = ["remove_repeated_ngrams"]

from pythainlp.llm.text_util import remove_repeated_ngrams
50 changes: 50 additions & 0 deletions pythainlp/llm/text_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: C901

from typing import List


def _append_without_overlap(output: List[str], tokens: List[str]) -> None:
    """Extend ``output`` with ``tokens``, skipping the longest prefix of
    ``tokens`` that ``output`` already ends with."""
    for size in range(min(len(output), len(tokens)), 0, -1):
        if output[-size:] == tokens[:size]:
            output.extend(tokens[size:])
            return
    output.extend(tokens)


def remove_repeated_ngrams(string_list: List[str], n: int = 2) -> List[str]:
    """
    Remove repeated n-grams

    The list is scanned from left to right with a window of ``n`` tokens.
    A window whose n-gram has already been seen is skipped. A window with
    a new n-gram is merged onto the output: tokens that the output already
    ends with are not added a second time. Trailing tokens that cannot
    form a full window are merged the same way, exactly once, so the
    output never contains tokens that were not present in the input.

    :param List[str] string_list: List of string
    :param int n: n-gram size
    :return: List of string. If ``string_list`` is empty or ``n`` is not
        positive, ``string_list`` itself is returned unchanged;
        otherwise a new list is returned.
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.llm import remove_repeated_ngrams

        remove_repeated_ngrams(['เอา', 'เอา', 'แบบ', 'ไหน'], n=1)
        # output: ['เอา', 'แบบ', 'ไหน']
    """
    if not string_list or n <= 0:
        return string_list

    seen_ngrams = set()
    output_list: List[str] = []

    # Number of start positions that yield a full window of n tokens.
    # It is <= 0 when the input is shorter than n, in which case the
    # loop below does not run and the whole input is treated as the tail.
    full_windows = len(string_list) - n + 1

    for start in range(full_windows):
        window = list(string_list[start:start + n])
        ngram = tuple(window)
        if ngram in seen_ngrams:
            continue
        seen_ngrams.add(ngram)
        _append_without_overlap(output_list, window)

    # Handle the trailing partial window once. Visiting it once per
    # remaining index would re-append the same tokens when n >= 3.
    tail = list(string_list[max(full_windows, 0):])
    _append_without_overlap(output_list, tail)

    return output_list
21 changes: 21 additions & 0 deletions tests/core/test_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

import unittest

from pythainlp.llm import remove_repeated_ngrams


class LlmTestCase(unittest.TestCase):
    """Unit tests for the helpers exported by ``pythainlp.llm``."""

    def test_remove_repeated_ngrams(self):
        """Check the result of ``remove_repeated_ngrams`` for n=1 and n=2."""
        tokens = ['เอา', 'เอา', 'แบบ', 'แบบ', 'แบบ', 'ไหน']
        # (n-gram size, expected output) pairs, checked in this order.
        cases = (
            (1, ['เอา', 'แบบ', 'ไหน']),
            (2, ['เอา', 'เอา', 'แบบ', 'แบบ', 'ไหน']),
        )
        for size, expected in cases:
            actual = remove_repeated_ngrams(tokens, n=size)
            self.assertEqual(actual, expected)

0 comments on commit 26f1982

Please sign in to comment.