'''This script loads pre-trained word embeddings (word2vec embeddings)
into a Keras Embedding layer, and uses it to train a text classification
model on a customized dataset.'''
from __future__ import print_function

from collections import defaultdict
import os
import re
import numpy as np
import pandas as pd
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Convolution1D, Dropout, Activation
from keras.models import Sequential
from keras.models import Model

w2v_file = 'G:/pre_trained word embeddings/word2vec/vectors.bin'
train_data = './cqa_title/traindata/userprofilepythontitle.txt'
test_data = './cqa_title/testdata/TestQuestionsPythonTitle.txt'
EMBEDDING_DIM = 400
MAX_SEQUENCE_LENGTH = 25
NB_FILTER = 128
FILTER_LENGTH = 5


def build_data_cv(clean_string=False):
    """
    Loads the train and test data. Each line is '<label> <text>';
    "split" marks the origin of the row (0 = train, 1 = test).
    """
    revs = []
    vocab = defaultdict(float)
    with open(train_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 0}
            revs.append(datum)
    with open(test_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 1}
            revs.append(datum)
    return revs, vocab


def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower-cased except for TREC.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()


def load_bin_vec(fname, vocab):
    """
    Loads word vectors from a word2vec binary file (Mikolov's format)
    for the words that appear in vocab.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs


def add_unknown_words(word_vecs, vocab, min_df=1, k=EMBEDDING_DIM):
    """
    For words that occur in at least min_df documents, create a separate
    word vector. 0.25 is chosen so the unknown vectors have (approximately)
    the same variance as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
    return word_vecs


def get_W(word_vecs, k=EMBEDDING_DIM):
    """
    Get the word matrix. W[i] is the vector for the word indexed by i;
    row 0 is reserved as the zero padding row.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
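Note that load_bin_vec walks the binary format byte by byte and only works as written under Python 2 (f.read(1) returns a str there, bytes under Python 3). If gensim is available, the same binary file can be read through its KeyedVectors loader; a minimal sketch of that alternative, assuming gensim is installed:

# A sketch of an alternative loader using gensim (assumed available); it
# parses the same word2vec binary format as load_bin_vec above.
from gensim.models import KeyedVectors

def load_bin_vec_gensim(fname, vocab):
    # load_word2vec_format handles the header line and the float32 payload
    kv = KeyedVectors.load_word2vec_format(fname, binary=True)
    # keep only the words that actually occur in the dataset vocabulary
    return {word: kv[word] for word in vocab if word in kv}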
Using Theano backend.
print ("loading data...")revs, vocab = build_data_cv()df = pd.DataFrame(revs)max_l = np.max(df)["num_words"]print ("data loaded!")print ("number of sentences: " + str(len(revs)))print ("vocab size: " + str(len(vocab)))print ("max sentence length: " + str(max_l))
loading data...
data loaded!
number of sentences: 252313
vocab size: 40441
max sentence length: 37
print ("loading data...")revs, vocab = build_data_cv()
loading data...
print(type(revs), len(revs), type(vocab), len(vocab))
<type 'list'> 252313 <type 'collections.defaultdict'> 40441
print(revs[0])
{'y': 0, 'text': 'parsing and modification of sql statements in java', 'split': 0, 'num_words': 8}
print ("loding word2vec vectors...")w2v = load_bin_vec(w2v_file, vocab)print ("word2vec loaded!")print ("num words already in word2vec: " + str(len(w2v)))w2v = add_unknown_words(w2v, vocab)W, word_idx_map = get_W(w2v)print ("dataset creaded!")
loading word2vec vectors...
word2vec loaded!
num words already in word2vec: 28520
dataset created!
type(w2v)  # the dict of word embeddings
dict
len(w2v)
40441
print(type(W), len(W))
print(W.shape)
<type 'numpy.ndarray'> 40442
(40442L, 400L)
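As a quick sanity check on get_W: every row of W should equal the w2v vector of the word that word_idx_map sends to that row, and row 0 should stay all zeros for padding. A small spot check ('python' is an assumed vocabulary word here):

# spot-check that get_W preserved the vectors ('python' assumed to be in vocab)
assert np.allclose(W[word_idx_map['python']], w2v['python'])
assert not W[0].any()  # row 0 is the zero padding row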
texts = df.text
tokenizer = Tokenizer(nb_words=len(vocab))
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = df.y
labels = to_categorical(np.asarray(labels))
print("Shape of the data tensor:", data.shape)
print("Shape of the label tensor:", labels.shape)
Found 40435 unique tokens.
Shape of the data tensor: (252313L, 25L)
Shape of the label tensor: (252313L, 2064L)
print(type(data), len(data), type(labels))
<type 'numpy.ndarray'> 252313 <type 'numpy.ndarray'>
print(type(data[0]), len(data[0]), data[0].shape)
print(data[0])
print(data[1])
print(data[2])  # does Keras's pad_sequences zero-pad at the front? (see the note below)
<type 'numpy.ndarray'> 25 (25L,)
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 385 8 2985 6 26 727 2 27]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 234 1 4315 1 5683]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20 9 5 432 168 78 124 385 1913]
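This answers the question in the comment above: pad_sequences pads (and truncates) at the front by default, i.e. padding='pre', which is why the zeros sit before each sequence. Passing padding='post' appends them instead; a small demonstration (the printed token ids depend on the fitted tokenizer, so the layout in the comments is only illustrative):

# pad_sequences defaults to padding='pre' (zeros in front of the sequence);
# padding='post' appends them after it instead
demo = tokenizer.texts_to_sequences(["parsing sql in java"])
print(pad_sequences(demo, maxlen=10))                  # zeros first, tokens last
print(pad_sequences(demo, maxlen=10, padding='post'))  # tokens first, zeros last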
x_train = data[:248300]
y_train = labels[:248300]
x_test = data[248300:]
y_test = labels[248300:]
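The 248300 boundary is hard-coded. Since build_data_cv appends the training lines first and tags every row with split (0 = train, 1 = test), the same boundary can be derived from the DataFrame instead; a sketch:

# derive the train/test boundary from the "split" flag instead of hard-coding it
n_train = int((df["split"] == 0).sum())
x_train, y_train = data[:n_train], labels[:n_train]
x_test, y_test = data[n_train:], labels[n_train:]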
# load the pre-trained word embeddings into an Embedding layer;
# note that we set trainable=True so as to fine-tune the embeddings
embedding_layer = Embedding(len(vocab) + 1,
                            EMBEDDING_DIM,
                            weights=[W],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
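One caveat: the rows of W follow word_idx_map (the iteration order of the w2v dict), while the integer ids in data come from the Tokenizer's word_index, so with weights=[W] the pre-trained vectors do not land on the rows the model actually looks up. For the vectors to line up with the token ids, the embedding matrix should be built from word_index; a sketch of that fix (note it changes the input dimension to len(word_index) + 1, so the summary below, which was produced with W, would change accordingly):

# build an embedding matrix whose row i holds the vector of the token that
# the Tokenizer mapped to id i; row 0 stays zero for padding
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
for word, i in word_index.items():
    if word in w2v:
        embedding_matrix[i] = w2v[word]

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)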
print ("Training model.")model = Sequential()model.add(embedding_layer)model.add(Convolution1D(nb_filter=NB_FILTER, filter_length=FILTER_LENGTH, border_mode='valid', activation='relu', subsample_length=1))# use max pooling:model.add(MaxPooling1D(pool_length=model.output_shape[1]))# We flatten the output of the conv layer,model.add(Dropout(0.5))model.add(Activation('relu'))# so that we can add a vanilla dense layer:model.add(Flatten())# We add a vanilla hidden layer:model.add(Dense(128, activation='relu'))# We project onto a single unit output layer, and squash it with a sigmoid:model.add(Dense(len(labels[0]), activation='softmax'))model.summary()
Training model.
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to
====================================================================================================
embedding_1 (Embedding)          (None, 25, 400)       16176800    embedding_input_1[0][0]
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 21, 128)       256128      embedding_1[0][0]
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 1, 128)        0           convolution1d_1[0][0]
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 1, 128)        0           maxpooling1d_1[0][0]
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 1, 128)        0           dropout_1[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 128)           0           activation_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 128)           16512       flatten_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 2064)          266256      dense_1[0][0]
====================================================================================================
Total params: 16,715,696
Trainable params: 16,715,696
Non-trainable params: 0
____________________________________________________________________________________________________
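Setting pool_length to model.output_shape[1] (21 here, the number of convolution windows) makes the pooling global: one maximum per filter over the whole sequence, which is why maxpooling1d_1 outputs (None, 1, 128) and Flatten reduces it to (None, 128). Keras also provides GlobalMaxPooling1D, which does this in one step and drops the time axis itself; a sketch of the equivalent stack (not a drop-in replacement for the already-built model above):

# equivalent model using GlobalMaxPooling1D; its output is already 2D,
# so no Flatten layer is needed
from keras.layers import GlobalMaxPooling1D

model2 = Sequential()
model2.add(Embedding(len(vocab) + 1, EMBEDDING_DIM, weights=[W],
                     input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model2.add(Convolution1D(nb_filter=NB_FILTER, filter_length=FILTER_LENGTH,
                         border_mode='valid', activation='relu',
                         subsample_length=1))
model2.add(GlobalMaxPooling1D())          # (None, 128)
model2.add(Dropout(0.5))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(len(labels[0]), activation='softmax'))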
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          nb_epoch=20,
          batch_size=128)
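Once fit returns, the held-out split can be scored directly; evaluate returns the loss followed by the metrics passed to compile:

# score the test split; returns [loss, acc] since compile used metrics=['acc']
score = model.evaluate(x_test, y_test, batch_size=128)
print("Test loss: %.4f, test accuracy: %.4f" % (score[0], score[1]))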