PaddleHub使用示例
2020-04-12
2 min read
最近用了一下PaddleHub,感觉还挺好用的。这里给出两个使用PaddleHub的示例。
分词
这个分词和官网的分词效果一样,我觉得比jieba之类的要好。
# pip install pyahocorasick
# https://www.paddlepaddle.org.cn/hubdetail?name=lac&en_category=LexicalAnalysis
import paddlehub as hub
# Custom dictionary entries handed to make_dict below; every entry holds a
# word, its tag, and a frequency (stored as a string).
temp_user_dict = [
    {'word': '自然', 'tag': 'n', 'freq': '10000'},
]
def make_dict(user_dicts):
    """Write user dictionary entries to ``user.dict`` in the working directory.

    Each entry becomes one line of the form ``word<TAB>tag<TAB>freq``. An
    existing ``user.dict`` is overwritten; an empty input yields an empty file.

    Args:
        user_dicts: Iterable of mappings, each providing the keys ``'word'``,
            ``'tag'`` and ``'freq'``. ``word`` and ``tag`` must be strings;
            ``freq`` may be a string or a number (it is converted with
            ``str``).

    Raises:
        KeyError: If an entry lacks one of the three keys.
        TypeError: If ``word`` or ``tag`` is not a string.
    """
    # Encode explicitly as UTF-8 so the file content does not depend on the
    # platform's default encoding; the entries contain non-ASCII (Chinese)
    # words, which a locale encoding such as cp1252 cannot represent.
    with open('user.dict', 'w', encoding='utf-8') as f:
        f.writelines(
            '\t'.join((entry['word'], entry['tag'], str(entry['freq']))) + '\n'
            for entry in user_dicts
        )
# Write the custom dictionary to user.dict first, then point LAC at that file.
make_dict(temp_user_dict)

lac = hub.Module(name='lac')
lac.set_user_dict(dict_path='user.dict')

# Analyse a single sentence without GPU, one text per batch, requesting tags.
results = lac.lexical_analysis(
    texts=['我爱自然语言处理'], use_gpu=False, batch_size=1, return_tag=True)

# Print the "word" entry and then the "tag" entry of every result.
for result in results:
    for field in ("word", "tag"):
        print(result[field])
阅读理解
import paddlehub as hub
# The module context and the data reader are given the same sequence length.
max_seq_len = 384

module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
inputs, outputs, program = module.context(
    trainable=True, max_seq_len=max_seq_len)

# CMRC2018 dataset, read using the vocabulary file reported by the module.
dataset = hub.dataset.CMRC2018()
reader = hub.reader.ReadingComprehensionReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=max_seq_len,
)

# Optimisation strategy and run configuration (no CUDA, 2 epochs, batch 12).
strategy = hub.AdamWeightDecayStrategy(
    learning_rate=5e-5, weight_decay=0.01, warmup_proportion=0.1)
config = hub.RunConfig(
    use_cuda=False, num_epoch=2, batch_size=12, strategy=strategy)

seq_output = outputs["sequence_output"]

# The order of the tensors in feed_list must not be changed.
feed_keys = ("input_ids", "position_ids", "segment_ids", "input_mask")
feed_list = [inputs[key].name for key in feed_keys]

reading_comprehension_task = hub.ReadingComprehensionTask(
    data_reader=reader,
    feature=seq_output,
    feed_list=feed_list,
    config=config,
    sub_task="cmrc2018",
)
reading_comprehension_task.finetune_and_eval()