import re

def split_text_into_batches(text, max_tokens_per_batch):
    # Regular expression that splits sentences at Chinese sentence-ending punctuation.
    sentence_splitter = re.compile(r'(?<=[。!?])')
    # Split the text into sentences.
    sentences = [sentence.strip() for sentence in sentence_splitter.split(text) if sentence.strip()]
    # Initialize the batch list and the batch currently being filled.
    batches = []
    current_batch = ""
    for sentence in sentences:
        # Note: len() counts characters here, used as a rough proxy for tokens.
        if len(current_batch) + len(sentence) <= max_tokens_per_batch:
            current_batch += sentence + " "
        else:
            # Find the punctuation mark closest to the max_tokens_per_batch limit.
            last_punctuation_index = max(current_batch.rfind('。'),
                                         current_batch.rfind('!'),
                                         current_batch.rfind('?'))
            # If there is no punctuation within the limit, split at the last space instead;
            # +1 keeps the punctuation mark inside the closed batch.
            split_index = last_punctuation_index + 1 if last_punctuation_index != -1 else current_batch.rfind(' ')
            if split_index <= 0:
                split_index = len(current_batch)
            # Close the current batch at the split index.
            batch = current_batch[:split_index].strip()
            if batch:  # Skip empty batches (e.g. when the very first sentence is over the limit).
                batches.append(batch)
            # Start a new batch with the current sentence.
            current_batch = sentence + " "
    if current_batch.strip():  # Make sure an empty string is not added as a batch.
        batches.append(current_batch.strip())
    return batches
text = ""
max_tokens_per_batch = 20
batches = split_text_into_batches(text, max_tokens_per_batch)
print("Batches:", batches)
import re
import string

import jieba
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the tokenizer and stopword resources used below.
nltk.download('punkt')
nltk.download('stopwords')

def clean_html_tags(text):
    # Strip HTML tags such as <p> or <br/>.
    clean_text = re.sub(r'<.*?>', '', text)
    return clean_text

def remove_links(text):
    # Remove http/https URLs.
    clean_text = re.sub(r'http\S+', '', text)
    return clean_text

def remove_special_characters(text):
    # Remove ASCII punctuation; note that string.punctuation does not cover
    # full-width Chinese punctuation such as 。 or ,.
    clean_text = ''.join(char for char in text if char not in string.punctuation)
    return clean_text

def remove_extra_whitespace(text):
    # Collapse runs of whitespace into single spaces.
    clean_text = ' '.join(text.split())
    return clean_text

def remove_stopwords(text):
    # Remove English stopwords; this only affects English words mixed into the text,
    # Chinese words are left untouched.
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    clean_text = ' '.join(word for word in word_tokens if word.lower() not in stop_words)
    return clean_text

def clean_chinese_text(text):
    # Strip HTML tags.
    cleaned_text = clean_html_tags(text)
    # Remove links.
    cleaned_text = remove_links(cleaned_text)
    # Remove special characters.
    cleaned_text = remove_special_characters(cleaned_text)
    # Remove extra whitespace.
    cleaned_text = remove_extra_whitespace(cleaned_text)
    # Remove (English) stopwords.
    cleaned_text = remove_stopwords(cleaned_text)
    # Segment the text with jieba.
    word_list = jieba.lcut(cleaned_text)
    # Join the segmented words back into a cleaned, space-separated string.
    cleaned_text = ' '.join(word_list)
    return cleaned_text
input_text = ""
cleaned_text = clean_chinese_text(input_text)
print(cleaned_text)
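# A minimal usage sketch with made-up input (not from the original) showing the full
# pipeline: the HTML tags and the URL are stripped, the English stopword "the" is
# dropped, and jieba re-segments the remaining Chinese text into space-separated words.
sample_html = "<p>欢迎访问 the 网站 http://example.com 了解更多信息</p>"
print(clean_chinese_text(sample_html))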