NNLM语言模型python实现(例子:基于中文语料)

时间:2020-8-28 作者:admin


原理图

NNLM语言模型python实现(例子:基于中文语料)

运用场景:

知道句子的前N-1个词,来预测第N个词。

网络的流程:

1.词语one-hot编码 → 2.projection_layer层 → 3.hidden_layer层 → 4.SoftMax层

1准备工作

这份代码是我在GitHub上看到的,原代码针对英文语料,英文处理起来比较简单。我把它修改成了针对中文语料的版本,并给出Keras版代码。


import numpy as np
import tensorflow as tf
import re
# Toy corpus used for both training and testing.
sentences = ["我爱你", "余登武", "范冰冰"]


# Character-level tokenizer
def seg_char(sent):
    """Split ``sent`` into tokens, one per Chinese character.

    Every character in the range U+4E00..U+9FA5 becomes its own token; any
    run of other characters between two such characters is kept as a single
    token. Tokens that are empty or consist only of whitespace are dropped.
    """
    # The capturing group makes re.split() return the matched characters
    # themselves, not only the text between them.
    pieces = re.split(r'([\u4e00-\u9fa5])', sent)
    return [piece for piece in pieces if piece.strip()]

# Build the vocabulary from every character of the corpus.
# The characters are flattened sentence by sentence, so sentences of different
# lengths (and a corpus with a single character) are handled as well.
chars = np.array([ch for sent in sentences for ch in seg_char(sent)]).reshape(1, -1)
# chars: [['我' '爱' '你' '余' '登' '武' '范' '冰' '冰']]

# sorted() gives each character the same index on every run; the iteration
# order of a bare set of strings can differ between interpreter processes.
word_list = sorted(set(chars.ravel().tolist()))
# word_list: ['余', '你', '冰', '我', '武', '爱', '登', '范']

# character -> class index
word_dict = {w: i for i, w in enumerate(word_list)}
# word_dict: {'余': 0, '你': 1, '冰': 2, '我': 3, '武': 4, '爱': 5, '登': 6, '范': 7}

# class index -> character (inverse of word_dict)
number_dict = {i: w for i, w in enumerate(word_list)}
# number_dict: {0: '余', 1: '你', 2: '冰', 3: '我', 4: '武', 5: '爱', 6: '登', 7: '范'}

n_class = len(word_dict)  # vocabulary size

2输入输出one-hot编码

# NNLM hyper-parameters
n_step = 2  # number of context characters per example, e.g. '我 爱', '余 登', '范 冰'
n_hidden = 2  # number of hidden units


def make_batch(sentences):
    """Convert sentences into one-hot encoded (context, target) pairs.

    For every sentence, all tokens except the last form the context and the
    last token is the prediction target.

    Args:
        sentences: iterable of strings; each is tokenized with ``seg_char``.

    Returns:
        A tuple ``(input_batch, target_batch)`` of lists with one entry per
        sentence: ``input_batch[i]`` is an array of shape
        ``(number of tokens - 1, n_class)`` and ``target_batch[i]`` an array
        of shape ``(n_class,)``.

    Raises:
        KeyError: if a token is not in ``word_dict``.
        IndexError: if a sentence yields no tokens.
    """
    # Row i of the identity matrix is the one-hot vector of class i; build it
    # once instead of twice per sentence.
    one_hot = np.eye(n_class)
    input_batch = []
    target_batch = []

    for sen in sentences:
        ids = [word_dict[ch] for ch in seg_char(sen)]
        input_batch.append(one_hot[ids[:-1]])
        # Copy so the target does not alias a row of the shared matrix.
        target_batch.append(one_hot[ids[-1]].copy())

    return input_batch, target_batch


input_batch, target_batch = make_batch(sentences)

NNLM语言模型python实现(例子:基于中文语料)
NNLM语言模型python实现(例子:基于中文语料)

3模型

# Model: a feed-forward NNLM (flattened one-hot context -> tanh hidden layer -> logits).
# NOTE(review): tf.placeholder / tf.random_normal / tf.train.AdamOptimizer look
# like TensorFlow 1.x graph-mode APIs -- confirm the installed version before running.
# X receives the one-hot context characters, Y the one-hot target character.
X = tf.placeholder(tf.float32, [None, n_step, n_class]) # [batch_size, number of steps, number of Vocabulary]
Y = tf.placeholder(tf.float32, [None, n_class])

# Flatten the n_step one-hot vectors of each example into a single row.
# NOTE: this module-level name shadows the builtin input().
input = tf.reshape(X, shape=[-1, n_step * n_class]) # [batch_size, n_step * n_class]
# Trainable parameters, all initialised from a random normal distribution:
# H, d are the weights and bias of the hidden layer; U, b those of the output layer.
H = tf.Variable(tf.random_normal([n_step * n_class, n_hidden]))
d = tf.Variable(tf.random_normal([n_hidden]))
U = tf.Variable(tf.random_normal([n_hidden, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

# Hidden activations, then un-normalised class scores (logits).
tanh = tf.nn.tanh(d + tf.matmul(input, H)) # [batch_size, n_hidden]
model = tf.matmul(tanh, U) + b # [batch_size, n_class]

# Loss: softmax cross-entropy between the logits and the one-hot labels,
# averaged over the batch; minimised with Adam at learning rate 0.001.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
# Predicted class index: position of the largest logit along axis 1.
prediction =tf.argmax(model, 1)

4训练

# Training: initialise all variables, then run 5000 optimizer steps on the
# whole corpus, reporting the loss after every 1000th step.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

train_feed = {X: input_batch, Y: target_batch}
for epoch in range(5000):
    _, loss = sess.run([optimizer, cost], feed_dict=train_feed)
    step = epoch + 1
    if step % 1000 != 0:
        continue
    print('Epoch:', '%04d' % step, 'cost =', '{:.6f}'.format(loss))

# Predict: evaluate the predicted class index for every training context.
# sess.run() is given a one-element list, so `predict` is a one-element list.
predict = sess.run([prediction], feed_dict={X: input_batch})

5测试

# Test: print the context characters of every sentence next to the character
# the model predicts for them. The contexts are built once, under a name that
# does not shadow the builtin input(), and sliced with n_step so they stay in
# sync with the context length used by the model.
contexts = [seg_char(sen)[:n_step] for sen in sentences]
predicted_chars = [number_dict[n] for n in predict[0]]
print(contexts, '预测得到->', predicted_chars)

NNLM语言模型python实现(例子:基于中文语料)

全文代码

import numpy as np
import tensorflow as tf
import re
# Toy corpus used for both training and testing.
sentences = ["我爱你", "余登武", "范冰冰"]


def seg_char(sent):
    """Tokenize ``sent`` at character level.

    Each Chinese character (U+4E00..U+9FA5) is returned as a separate token;
    text between two such characters stays together as one token. Empty and
    whitespace-only tokens are removed.
    """
    # Splitting on a capturing group keeps the delimiters (the Chinese
    # characters) in the result list.
    fragments = re.split(r'([\u4e00-\u9fa5])', sent)
    # str.strip returns '' (falsy) for empty / whitespace-only fragments.
    return list(filter(str.strip, fragments))

# Build the vocabulary from every character of the corpus.
# Flattening character by character (rather than stacking one list per
# sentence into a 2-D array) also works when sentences differ in length.
chars = np.array([ch for sent in sentences for ch in seg_char(sent)]).reshape(1, -1)
# chars: [['我' '爱' '你' '余' '登' '武' '范' '冰' '冰']]

# Sort the unique characters so that the index assigned to each character is
# the same on every run; plain set iteration order is not stable across runs.
word_list = sorted(set(chars.ravel().tolist()))
# word_list: ['余', '你', '冰', '我', '武', '爱', '登', '范']

# character -> class index
word_dict = {w: i for i, w in enumerate(word_list)}
# word_dict: {'余': 0, '你': 1, '冰': 2, '我': 3, '武': 4, '爱': 5, '登': 6, '范': 7}

# class index -> character (inverse of word_dict)
number_dict = {i: w for i, w in enumerate(word_list)}
# number_dict: {0: '余', 1: '你', 2: '冰', 3: '我', 4: '武', 5: '爱', 6: '登', 7: '范'}

n_class = len(word_dict)  # vocabulary size

# NNLM hyper-parameters
n_step = 2  # number of context characters per example, e.g. '我 爱', '余 登', '范 冰'
n_hidden = 2  # number of hidden units


def make_batch(sentences):
    """Build one-hot encoded training pairs from ``sentences``.

    Each sentence is tokenized with ``seg_char``; all tokens but the last are
    the model input and the last token is the target.

    Args:
        sentences: iterable of strings.

    Returns:
        ``(input_batch, target_batch)``: two lists with one numpy array per
        sentence, of shape ``(number of tokens - 1, n_class)`` and
        ``(n_class,)`` respectively.

    Raises:
        KeyError: if a token is missing from ``word_dict``.
        IndexError: if a sentence produces no tokens.
    """
    # Identity matrix whose i-th row is the one-hot code of class i. It does
    # not depend on the sentence, so it is created once outside the loop.
    identity = np.eye(n_class)
    input_batch, target_batch = [], []

    for sen in sentences:
        token_ids = [word_dict[token] for token in seg_char(sen)]
        context_ids, target_id = token_ids[:-1], token_ids[-1]
        input_batch.append(identity[context_ids])
        # Copy the row so callers never hold a view into the shared matrix.
        target_batch.append(identity[target_id].copy())

    return input_batch, target_batch


input_batch, target_batch = make_batch(sentences)


# Model: a feed-forward NNLM (flattened one-hot context -> tanh hidden layer -> logits).
# NOTE(review): tf.placeholder / tf.random_normal / tf.train.AdamOptimizer look
# like TensorFlow 1.x graph-mode APIs -- confirm the installed version before running.
# X receives the one-hot context characters, Y the one-hot target character.
X = tf.placeholder(tf.float32, [None, n_step, n_class]) # [batch_size, number of steps, number of Vocabulary]
Y = tf.placeholder(tf.float32, [None, n_class])

# Flatten the n_step one-hot vectors of each example into a single row.
# NOTE: this module-level name shadows the builtin input().
input = tf.reshape(X, shape=[-1, n_step * n_class]) # [batch_size, n_step * n_class]
# Trainable parameters, all initialised from a random normal distribution:
# H, d are the weights and bias of the hidden layer; U, b those of the output layer.
H = tf.Variable(tf.random_normal([n_step * n_class, n_hidden]))
d = tf.Variable(tf.random_normal([n_hidden]))
U = tf.Variable(tf.random_normal([n_hidden, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

# Hidden activations, then un-normalised class scores (logits).
tanh = tf.nn.tanh(d + tf.matmul(input, H)) # [batch_size, n_hidden]
model = tf.matmul(tanh, U) + b # [batch_size, n_class]

# Loss: softmax cross-entropy between the logits and the one-hot labels,
# averaged over the batch; minimised with Adam at learning rate 0.001.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
# Predicted class index: position of the largest logit along axis 1.
prediction =tf.argmax(model, 1)


# Training: create a session, initialise the variables and run 5000
# optimizer steps on the full corpus; the loss is printed every 1000 steps.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

train_feed = {X: input_batch, Y: target_batch}
for epoch in range(5000):
    _, loss = sess.run([optimizer, cost], feed_dict=train_feed)
    step = epoch + 1
    if step % 1000 != 0:
        continue
    print('Epoch:', '%04d' % step, 'cost =', '{:.6f}'.format(loss))

# Predict: class indices for the training contexts. The fetch argument is a
# one-element list, so the result is a one-element list as well.
predict = sess.run([prediction], feed_dict={X: input_batch})

# Test: print the context characters of every sentence next to the character
# the model predicts for them. The contexts are built once, under a name that
# does not shadow the builtin input(), and sliced with n_step so they stay in
# sync with the context length used by the model.
contexts = [seg_char(sen)[:n_step] for sen in sentences]
predicted_chars = [number_dict[n] for n in predict[0]]
print(contexts, '预测得到->', predicted_chars)

Keras 版代码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author: yudengwu
# @Date  : 2020/8/26

from keras.models import Sequential

import numpy as np
import tensorflow as tf
import re
# Toy corpus used for both training and testing.
sentences = ["我爱你", "余登武", "范冰冰"]


# Character-level tokenizer
def seg_char(sent):
    """Return the tokens of ``sent``, one token per Chinese character.

    Characters in U+4E00..U+9FA5 are emitted individually; any other text
    between them is emitted as one token per run. Tokens that are empty or
    whitespace-only are discarded.
    """
    # A capturing group in the split pattern keeps the matched characters in
    # the output alongside the text that separates them.
    han_char = re.compile(r'([\u4e00-\u9fa5])')
    tokens = []
    for fragment in han_char.split(sent):
        if fragment.strip():
            tokens.append(fragment)
    return tokens

# Build the vocabulary from every character of the corpus.
# The characters are flattened sentence by sentence, so sentences of different
# lengths (and a corpus with a single character) are handled as well.
chars = np.array([ch for sent in sentences for ch in seg_char(sent)]).reshape(1, -1)
# chars: [['我' '爱' '你' '余' '登' '武' '范' '冰' '冰']]

# sorted() gives each character the same index on every run; the iteration
# order of a bare set of strings can differ between interpreter processes.
word_list = sorted(set(chars.ravel().tolist()))
# word_list: ['余', '你', '冰', '我', '武', '爱', '登', '范']

# character -> class index
word_dict = {w: i for i, w in enumerate(word_list)}
# word_dict: {'余': 0, '你': 1, '冰': 2, '我': 3, '武': 4, '爱': 5, '登': 6, '范': 7}

# class index -> character (inverse of word_dict)
number_dict = {i: w for i, w in enumerate(word_list)}
# number_dict: {0: '余', 1: '你', 2: '冰', 3: '我', 4: '武', 5: '爱', 6: '登', 7: '范'}

n_class = len(word_dict)  # vocabulary size

# NNLM hyper-parameters
n_step = 2  # number of context characters per example, e.g. '我 爱', '余 登', '范 冰'


# One-hot encode the inputs and targets
def make_batch(sentences):
    """Convert sentences into one-hot encoded (context, target) pairs.

    For every sentence, all tokens except the last form the context and the
    last token is the prediction target.

    Args:
        sentences: iterable of strings; each is tokenized with ``seg_char``.

    Returns:
        A tuple ``(input_batch, target_batch)`` of lists with one entry per
        sentence: ``input_batch[i]`` is an array of shape
        ``(number of tokens - 1, n_class)`` and ``target_batch[i]`` an array
        of shape ``(n_class,)``.

    Raises:
        KeyError: if a token is not in ``word_dict``.
        IndexError: if a sentence yields no tokens.
    """
    # Row i of the identity matrix is the one-hot vector of class i; build it
    # once instead of twice per sentence.
    one_hot = np.eye(n_class)
    input_batch = []
    target_batch = []

    for sen in sentences:
        ids = [word_dict[ch] for ch in seg_char(sen)]
        input_batch.append(one_hot[ids[:-1]])
        # Copy so the target does not alias a row of the shared matrix.
        target_batch.append(one_hot[ids[-1]].copy())

    return input_batch, target_batch


input_batch, target_batch = make_batch(sentences)
# The Dense input layer takes one flat vector per example, so the n_step
# one-hot vectors of each context are concatenated into a single row.
input_batch = np.array(input_batch).reshape(-1, n_step * n_class)
target_batch = np.array(target_batch).reshape(-1, n_class)



from keras.layers import Dense
import keras


# Model
def define_model(n_hidden=2):
    """Build, compile and summarise the Keras NNLM.

    Architecture: flattened one-hot context of size ``n_step * n_class`` ->
    ``Dense(n_hidden, tanh)`` -> ``Dense(n_class, softmax)``.

    Args:
        n_hidden: number of units in the hidden layer. Defaults to 2, the
            value that was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric). The layer summary is printed
        as a side effect.
    """
    model = Sequential()

    model.add(Dense(n_hidden, activation='tanh', input_shape=(n_step * n_class,)))
    model.add(Dense(n_class, activation='softmax'))  # output layer
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model


model = define_model()
# The corpus is tiny, so one or two epochs have no visible effect; train for 5000.
model.fit(input_batch, target_batch, epochs=5000)


# Prediction check on the training contexts

class_scores = model.predict(input_batch)
predict = np.argmax(class_scores, axis=1)  # index of the largest score in each row

contexts = [seg_char(sen)[:2] for sen in sentences]
print('输入的是:', contexts)
predicted_chars = [number_dict[n] for n in predict]
print('预测得到:', predicted_chars)

NNLM语言模型python实现(例子:基于中文语料)

NNLM 缺点:如果词表太大,会造成输入和输出维度过大的问题。
可以取出中间层权重作为词向量,即 word2vec 的思路。
NNLM 也可以做文本生成:用 for 循环逐字预测,并加入随机采样。
可以看下另一篇关于文本生成的博客。

古诗文本自动生成唐诗文本生成(算例代码)

NNLM语言模型python实现(例子:基于中文语料)
电气专业的计算机萌新,写博文不容易。如果你觉得本文对你有用,请点个赞支持下,谢谢。

NNLM语言模型python实现(例子:基于中文语料)

声明:本文内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎进行举报,并提供相关证据,工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。