Keras sentence classification: keras_demo_for_sentence_classification (simplified version)
Published: 2019-05-25


'''This script loads pre-trained word embeddings (word2vec embeddings)
into a Keras Embedding layer, and uses it to train a text classification
model on a customized dataset.'''
from __future__ import print_function
from collections import defaultdict
import os
import re
import numpy as np
import pandas as pd
np.random.seed(1337)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Convolution1D, Dropout, Activation
from keras.models import Sequential
from keras.models import Model

w2v_file = 'G:/pre_trained word embeddings/word2vec/vectors.bin'
train_data = './cqa_title/traindata/userprofilepythontitle.txt'
test_data = './cqa_title/testdata/TestQuestionsPythonTitle.txt'
EMBEDDING_DIM = 400
MAX_SEQUENCE_LENGTH = 25
NB_FILTER = 128
FILTER_LENGTH = 5

def build_data_cv(clean_string=False):
    """
    Loads data.
    """
    revs = []
    vocab = defaultdict(float)
    with open(train_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 0}
            revs.append(datum)
    with open(test_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 1}
            revs.append(datum)
    return revs, vocab

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()

def load_bin_vec(fname, vocab):
    """
    Loads word vecs from a binary word2vec (Mikolov) file.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=EMBEDDING_DIM):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same variance
    as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
    return word_vecs

def get_W(word_vecs, k=EMBEDDING_DIM):
    """
    Get word matrix. W[i] is the vector for the word indexed by i.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
Using Theano backend.
print ("loading data...")revs, vocab = build_data_cv()df = pd.DataFrame(revs)max_l = np.max(df)["num_words"]print ("data loaded!")print ("number of sentences: " + str(len(revs)))print ("vocab size: " + str(len(vocab)))print ("max sentence length: " + str(max_l))
loading data...
data loaded!
number of sentences: 252313
vocab size: 40441
max sentence length: 37
print ("loading data...")revs, vocab = build_data_cv()
loading data...
print (type(revs), len(revs),type(vocab),len(vocab))
<type 'list'> 252313 <type 'collections.defaultdict'> 40441
print (revs[0])
{'y': 0, 'text': 'parsing and modification of sql statements in java', 'split': 0, 'num_words': 8}
Summary 1: revs is the list that holds the data (252,313 examples); vocab is the vocabulary, a dict with 40,441 words.
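For reference, each line of the train/test files is expected to begin with a 1-based integer class label followed by the question title. A minimal sketch of how one such line (a made-up example, not taken from the actual files) becomes a datum dict:

# Hypothetical input line: first token = 1-based class label, the rest = question title.
line = "1 Parsing and modification of SQL statements in Java"
words = line.strip().lower().split()
datum = {"y": int(words[0]) - 1,          # labels shifted to start at 0
         "text": " ".join(words[1:]),
         "num_words": len(words) - 1,
         "split": 0}                      # 0 = train, 1 = test
print (datum)
# -> {'y': 0, 'text': 'parsing and modification of sql statements in java', 'num_words': 8, 'split': 0}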
print ("loding word2vec vectors...")w2v = load_bin_vec(w2v_file, vocab)print ("word2vec loaded!")print ("num words already in word2vec: " + str(len(w2v)))w2v = add_unknown_words(w2v, vocab)W, word_idx_map = get_W(w2v)print ("dataset creaded!")
loading word2vec vectors...
word2vec loaded!
num words already in word2vec: 28520
dataset created!
type(w2v) #word embeddings
dict
len(w2v)
40441
print (type(W), len(W))
print (W.shape)
<type 'numpy.ndarray'> 40442
(40442L, 400L)
Summary 2: the word vectors start out as a dict (w2v) and are converted into an np.ndarray (W). One extra row is prepended to W because the sentences are zero-padded; that row corresponds to the padding index.
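A quick sanity check of that layout (a sketch that reuses the W, w2v and word_idx_map built above):

# Row 0 of W is the all-zero vector reserved for padding;
# every real word occupies exactly one of the remaining rows.
assert W.shape == (len(w2v) + 1, EMBEDDING_DIM)                      # (40442, 400)
assert not W[0].any()                                                # padding row is all zeros
assert sorted(word_idx_map.values()) == list(range(1, len(w2v) + 1))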
texts = df.text
tokenizer = Tokenizer(nb_words=len(vocab))
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print ("Found %s unique tokens." % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = df.y
labels = to_categorical(np.asarray(labels))
print ("Shape of the data tensor:", data.shape)
print ("Shape of the label tensor:", labels.shape)
Found 40435 unique tokens.
Shape of the data tensor: (252313L, 25L)
Shape of the label tensor: (252313L, 2064L)
print (type(data),len(data), type(labels))
<type 'numpy.ndarray'> 252313 <type 'numpy.ndarray'>
print (type(data[0]), len(data[0]), data[0].shape)
print (data[0])
print (data[1])
print (data[2])  # does Keras's pad_sequences pad with zeros at the front?? (answered after Summary 3)
<type 'numpy.ndarray'> 25 (25L,)
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 385 8 2985 6 26 727 2 27]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 234 1 4315 1 5683]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20 9 5 432 168 78 124 385 1913]
Summary 3: data is the padded text, an np.ndarray with 252,313 rows of length 25; labels holds the corresponding one-hot labels, shape (252313, 2064).
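To answer the question in the comment above: yes, by default pad_sequences pads (and truncates) at the front. A small sketch of the difference, in case post-padding were preferred:

seq = [[385, 8, 2985]]
print (pad_sequences(seq, maxlen=6))                   # default padding='pre': zeros in front
# [[   0    0    0  385    8 2985]]
print (pad_sequences(seq, maxlen=6, padding='post'))   # zeros appended at the end
# [[ 385    8 2985    0    0    0]]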
x_train = data[:248300]
y_train = labels[:248300]
x_test = data[248300:]
y_test = labels[248300:]
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = True so as to fine-tune the embeddings
embedding_layer = Embedding(len(vocab) + 1,
                            EMBEDDING_DIM,
                            weights=[W],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
Summary 4: vocab is the vocabulary (a dict); one is added to its size because index 0 is reserved for the zero padding. EMBEDDING_DIM is the word-vector dimension, 400. weights is the embedding matrix of shape (40442, 400), and the input length is 25.
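One caveat worth flagging: W is indexed by word_idx_map from get_W, while the padded sequences use the Tokenizer's own word_index, so the two numberings are not guaranteed to line up. A common remedy (a sketch only, not what the original script does; the model below keeps the embedding_layer defined above) is to rebuild the weight matrix against word_index:

# Sketch: put the vector for the word the Tokenizer mapped to index i into row i,
# so that sequence indices and embedding rows agree; row 0 stays all zeros for padding.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
for word, i in word_index.items():
    vec = w2v.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
# This matrix (with len(word_index) + 1 as the input dimension) could then be passed
# to Embedding via weights=[embedding_matrix] in place of W and len(vocab) + 1.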
print ("Training model.")model = Sequential()model.add(embedding_layer)model.add(Convolution1D(nb_filter=NB_FILTER,                        filter_length=FILTER_LENGTH,                        border_mode='valid',                        activation='relu',                        subsample_length=1))# use max pooling:model.add(MaxPooling1D(pool_length=model.output_shape[1]))# We flatten the output of the conv layer,model.add(Dropout(0.5))model.add(Activation('relu'))# so that we can add a vanilla dense layer:model.add(Flatten())# We add a vanilla hidden layer:model.add(Dense(128, activation='relu'))# We project onto a single unit output layer, and squash it with a sigmoid:model.add(Dense(len(labels[0]), activation='softmax'))model.summary()
Training model.
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to
====================================================================================================
embedding_1 (Embedding)          (None, 25, 400)       16176800    embedding_input_1[0][0]
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 21, 128)       256128      embedding_1[0][0]
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 1, 128)        0           convolution1d_1[0][0]
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 1, 128)        0           maxpooling1d_1[0][0]
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 1, 128)        0           dropout_1[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 128)           0           activation_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 128)           16512       flatten_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 2064)          266256      dense_1[0][0]
====================================================================================================
Total params: 16,715,696
Trainable params: 16,715,696
Non-trainable params: 0
____________________________________________________________________________________________________
Summary 5: parameter calculation (checked in the sketch after this list).
  • Embedding layer: 40442 * 400 = 16176800
  • Convolution layer: 128 * 5 * 400 + 128 = 256128
  • Hidden fully connected layer (dense_1): 128 * 128 + 128 = 16512
  • Output softmax layer (dense_2): 128 * 2064 + 2064 = 266256
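These counts can be reproduced with one line of arithmetic per layer (a quick check against model.summary() above):

print (40442 * 400)                            # embedding_1:     16176800
print (128 * 5 * 400 + 128)                    # convolution1d_1:   256128
print (128 * 128 + 128)                        # dense_1:            16512
print (128 * 2064 + 2064)                      # dense_2:           266256
print (16176800 + 256128 + 16512 + 266256)     # total:           16715696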
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          nb_epoch=20, batch_size=128)
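After training, the held-out split (the last 4,013 examples set aside above) can be scored with the same Keras 1.x API; a sketch only, since the original post does not report final numbers:

# Evaluate loss and accuracy on the test split.
score, acc = model.evaluate(x_test, y_test, batch_size=128)
print ("Test loss:", score)
print ("Test accuracy:", acc)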

Reposted from: http://ajoji.baihongyu.com/
