OpenClaw 是一个轻量级的文本信息提取框架,专注于从非结构化文本中提取结构化信息。以下是一个简化版本的设计。

核心架构
class OpenClaw:
    """Small pipeline that preprocesses text and runs named extractors on it.

    Preprocessors are applied in the order they were added; every extractor
    then receives the same fully preprocessed text.
    """

    def __init__(self):
        # Maps extractor name -> callable taking the preprocessed text.
        self.extractors = {}
        # Callables taking text and returning text, applied in insertion order.
        self.preprocessors = []

    def add_extractor(self, name, extractor):
        """Register ``extractor`` under ``name`` (an existing entry is replaced)."""
        self.extractors[name] = extractor

    def add_preprocessor(self, preprocessor):
        """Append a text -> text function to the preprocessing chain."""
        self.preprocessors.append(preprocessor)

    def extract(self, text, extractor_names=None):
        """Preprocess ``text`` and run the selected extractors.

        Args:
            text: Raw input text.
            extractor_names: Names of the extractors to run. May be an
                iterable of names or a single name given as a string.
                ``None`` or an empty value runs every registered extractor.
                Names that are not registered are skipped silently.

        Returns:
            Dict mapping each extractor name that ran to its result.
        """
        processed_text = text
        for preprocessor in self.preprocessors:
            processed_text = preprocessor(processed_text)

        # A bare string would otherwise be iterated character by character
        # and silently match nothing.
        if extractor_names and isinstance(extractor_names, str):
            extractor_names = [extractor_names]

        # Snapshot the names so the selection is fixed before any extractor runs.
        if extractor_names:
            selected = list(extractor_names)
        else:
            selected = list(self.extractors)

        return {
            name: self.extractors[name](processed_text)
            for name in selected
            if name in self.extractors
        }
内置提取器
正则表达式提取器
import re
from typing import List, Dict
class RegexExtractor:
    """Extractor that applies one regular expression per output field."""

    def __init__(self, patterns: Dict[str, str]):
        """Store the field patterns and validate them eagerly.

        Args:
            patterns: Mapping of field name -> regular expression.

        Raises:
            re.error: If any pattern is not a valid regular expression.
        """
        self.patterns = patterns
        # Compile once up front so an invalid pattern fails at construction
        # time instead of on the first extraction.
        for pattern in patterns.values():
            re.compile(pattern)

    def __call__(self, text: str) -> Dict[str, list]:
        """Return every match of each field's pattern in ``text``.

        Follows ``re.findall`` semantics: a pattern without groups yields the
        matched strings, one group yields that group's text, and several
        groups yield tuples of group texts.
        """
        return {
            field: re.findall(pattern, text)
            for field, pattern in self.patterns.items()
        }
关键词上下文提取器
class KeywordContextExtractor:
    """Extractor that returns the text surrounding each configured keyword."""

    def __init__(self, keywords: List[str], window: int = 50, *,
                 all_occurrences: bool = False):
        """Configure the keywords and the context size.

        Args:
            keywords: Keywords to look for, checked in the given order.
            window: Number of characters kept on each side of the keyword.
            all_occurrences: When False (the default) only the first
                occurrence of each keyword is reported; when True every
                non-overlapping occurrence is reported.
        """
        self.keywords = keywords
        self.window = window
        self.all_occurrences = all_occurrences

    def __call__(self, text: str) -> Dict[str, List[Dict[str, str]]]:
        """Return ``{"contexts": [{"keyword": ..., "context": ...}, ...]}``.

        Each context is the keyword plus up to ``window`` characters on either
        side, clipped to the bounds of ``text``.
        """
        contexts = []
        for keyword in self.keywords:
            index = text.find(keyword)
            while index != -1:
                start = max(0, index - self.window)
                end = min(len(text), index + len(keyword) + self.window)
                contexts.append({
                    "keyword": keyword,
                    "context": text[start:end]
                })
                if not self.all_occurrences:
                    break
                # Advance by at least one character so an empty keyword
                # cannot loop forever.
                index = text.find(keyword, index + max(len(keyword), 1))
        return {"contexts": contexts}
基本使用示例
# Create the OpenClaw instance
claw = OpenClaw()
# Preprocessing function
def clean_text(text):
    """Normalise whitespace and drop stray symbols from ``text``.

    Runs of whitespace collapse to a single space. Characters are kept when
    they are word characters, whitespace, common ASCII or full-width CJK
    punctuation, or one of the symbols that typical extraction patterns
    depend on: ``@`` (e-mail), ``%`` (percentages), ``+``, ``/`` (dates) and
    the currency signs ``$`` and ``¥``. Everything else is removed.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    import re
    # Collapse redundant whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters. The whitelist must keep '@', '%', '$', '¥'
    # and '/', otherwise e-mail, percentage, money and date patterns can
    # never match the preprocessed text; full-width punctuation is kept so
    # CJK sentences retain their delimiters.
    text = re.sub(r'[^\w\s.,!?\-:;@%+/$¥¥,。:;!?、()]', '', text)
    return text.strip()
claw.add_preprocessor(clean_text)
# Register extractors for e-mail addresses and phone numbers.
# The inline (?a) flag makes \b and \d ASCII-only, so a match that directly
# follows a CJK character still sees a word boundary.
# The TLD class is [A-Za-z]; writing [A-Z|a-z] would also accept a literal '|'.
email_pattern = r'(?a)\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# The middle group is 3 or 4 digits so both 3-3-4 and 3-4-4 groupings
# (such as the sample number below) are recognised.
phone_pattern = r'(?a)\b\d{3}[-.]?\d{3,4}[-.]?\d{4}\b'
email_extractor = RegexExtractor({"emails": email_pattern})
phone_extractor = RegexExtractor({"phones": phone_pattern})
claw.add_extractor("emails", email_extractor)
claw.add_extractor("phones", phone_extractor)
# Register the keyword-context extractor
keyword_extractor = KeywordContextExtractor(["价格", "费用", "成本"], window=30)
claw.add_extractor("price_context", keyword_extractor)
# Run the pipeline on a sample text
sample_text = """
联系人:张三,电话:138-1234-5678,邮箱:zhangsan@example.com
产品价格:2999元,安装费用:500元,总成本约3500元。
"""
result = claw.extract(sample_text)
print(result)
输出结果(示意;实际内容取决于预处理规则、正则模式以及 window 参数)
{
"emails": {"emails": ["zhangsan@example.com"]},
"phones": {"phones": ["138-1234-5678"]},
"price_context": {
"contexts": [
{"keyword": "价格", "context": "产品价格:2999元,安装费用:500"},
{"keyword": "费用", "context": "价格:2999元,安装费用:500元,总成本"},
{"keyword": "成本", "context": "费用:500元,总成本约3500元。"}
]
}
}
规则引擎扩展
class RuleEngine:
    """Ordered collection of (condition, action) rules evaluated against text."""

    def __init__(self):
        # List of (condition_func, action_func) pairs in registration order.
        self.rules = []

    def add_rule(self, condition_func, action_func):
        """Add a rule.

        Args:
            condition_func: Called as ``condition_func(text, context)``; the
                rule fires when the result is truthy.
            action_func: Called as ``action_func(text, context)`` when the
                rule fires; its return value is collected by ``apply``.
        """
        self.rules.append((condition_func, action_func))

    def apply(self, text, context=None):
        """Evaluate every rule in order and return the results of those that fire."""
        return [
            action(text, context)
            for condition, action in self.rules
            if condition(text, context)
        ]
# Example rules
rule_engine = RuleEngine()
# Rule 1: when the text mentions both "发票" (invoice) and "金额" (amount),
# extract the amounts.
def has_invoice_amount(text, context):
    """Return True when ``text`` contains both "发票" and "金额".

    ``context`` is accepted to match the rule-condition signature and is
    not used.
    """
    return "发票" in text and "金额" in text
def extract_invoice_amount(text, context):
    """Collect the numeric amounts that follow "金额" in ``text``.

    An optional ASCII or full-width colon, optional whitespace and an
    optional currency sign (``¥``, ``¥`` or ``$``) may sit between "金额"
    and the number. ``context`` is accepted to match the rule-action
    signature and is not used.

    Returns:
        ``{"type": "invoice_amount", "values": [...]}`` where each value is
        the matched number as a string (integer or decimal).
    """
    import re
    # The currency sign is optional, so amounts written without one still
    # match exactly as they would with no currency handling at all.
    amounts = re.findall(r'金额[::]?\s*[¥¥$]?\s*(\d+(?:\.\d+)?)', text)
    return {"type": "invoice_amount", "values": amounts}
# Register rule 1: the condition function paired with its action function.
rule_engine.add_rule(has_invoice_amount, extract_invoice_amount)
简单CRF模型接口(可选)
class SimpleCRFExtractor:
    """Simplified CRF extractor (requires sklearn-crfsuite to be installed).

    Wraps a pickled sequence-labelling model exposing ``predict`` and turns
    its B-/I-/O tag sequence into entity dicts. Instances are callable, so
    one can be registered wherever a ``callable(text)`` extractor is expected.
    """

    def __init__(self, model_path=None):
        """Optionally load a model from ``model_path`` right away."""
        self.model = None
        if model_path:
            self.load_model(model_path)

    def load_model(self, model_path):
        """Load a pre-trained model from a pickle file."""
        # SECURITY: unpickling can execute arbitrary code. Only load model
        # files that come from a trusted source.
        import pickle
        with open(model_path, 'rb') as f:
            self.model = pickle.load(f)

    def __call__(self, text):
        """Alias for :meth:`extract` so the instance works as a plain callable."""
        return self.extract(text)

    def extract(self, text):
        """Predict labels for ``text`` and return the recognised entities.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: a loaded model object must not be
        # rejected just because it happens to evaluate as falsy.
        if self.model is None:
            raise ValueError("模型未加载")
        # Convert the text into per-token features
        features = self.text_to_features(text)
        # Predict one label per token (the model takes a batch of sequences)
        labels = self.model.predict([features])[0]
        # Group labelled tokens into entities
        return self.labels_to_entities(text, labels)

    def text_to_features(self, text):
        """Build one feature dict per whitespace-separated token.

        Simplified feature extraction; real applications need richer
        feature engineering.
        """
        words = text.split()
        last = len(words) - 1
        features = []
        for i, word in enumerate(words):
            features.append({
                'word': word,
                'word_lower': word.lower(),
                'is_digit': word.isdigit(),
                # Slicing already returns the whole word when it is shorter
                # than three characters.
                'prefix': word[:3],
                'suffix': word[-3:],
                'prev_word': words[i - 1] if i > 0 else '<START>',
                'next_word': words[i + 1] if i < last else '<END>'
            })
        return features

    def labels_to_entities(self, text, labels):
        """Convert a B-/I-/O label sequence into a list of entities.

        Each entity is a dict with ``text`` (tokens joined by spaces),
        ``type`` (the label without its ``B-`` prefix) and ``start``/``end``
        token indices. ``end`` is the index of the last token seen before
        the entity was closed. An ``I-`` label with no open entity is ignored.
        """
        entities = []
        current_entity = None
        words = text.split()
        for i, (word, label) in enumerate(zip(words, labels)):
            if label.startswith('B-'):
                if current_entity:
                    # A new entity starts immediately after the previous one.
                    current_entity['end'] = i - 1
                    entities.append(current_entity)
                current_entity = {
                    'text': word,
                    'type': label[2:],
                    'start': i
                }
            elif label.startswith('I-') and current_entity:
                current_entity['text'] += ' ' + word
            elif label == 'O' and current_entity:
                current_entity['end'] = i - 1
                entities.append(current_entity)
                current_entity = None
        if current_entity:
            # The sequence ended while an entity was still open; without this
            # flush the trailing entity would be lost.
            current_entity['end'] = i
            entities.append(current_entity)
        return entities
完整使用示例
# Complete example
def main():
    """Run an end-to-end demo: build a pipeline, extract, and print the results."""
    # 1. Initialise the pipeline
    claw = OpenClaw()
    # 2. Add preprocessing
    claw.add_preprocessor(clean_text)
    # 3. Configure the extractor
    # NOTE(review): the money and percentage patterns rely on the
    # preprocessor keeping '$', '¥' and '%' in the text; verify that
    # clean_text does not strip them.
    patterns = {
        "date": r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
        "money": r'¥\s*\d+(?:\.\d+)?|\$\s*\d+(?:\.\d+)?',
        "percentage": r'\d+(?:\.\d+)?%'
    }
    claw.add_extractor("patterns", RegexExtractor(patterns))
    # 4. Extract information
    text = "会议日期:2024-01-15,预算:$5000,完成率:85.5%"
    results = claw.extract(text)
    # 5. Print the results
    print("提取结果:")
    for key, value in results.items():
        print(f"{key}: {value}")


if __name__ == "__main__":
    main()
这个精简版 OpenClaw 提供了:
- 模块化设计:可轻松添加新的提取器和预处理器
- 多种提取方式:正则表达式、关键词上下文、规则引擎
- 可扩展性:支持集成更复杂的机器学习模型
- 轻量级:核心功能仅依赖 Python 标准库,简洁明了(可选的 CRF 提取器需额外安装 sklearn-crfsuite)
可以根据具体需求进一步扩展功能,如添加:
- 更多内置提取器
- 结果后处理
- 提取结果验证
- 多语言支持
- 批量处理功能
版权声明:除非特别标注,否则均为本站原创文章,转载时请以链接形式注明文章出处。