'''This script loads pre-trained word embeddings (word2vec embeddings)
into a Keras Embedding layer, and uses it to train a text classification
model on a customized dataset.'''
from __future__ import print_function

from collections import defaultdict
import os
import re
import numpy as np
import pandas as pd
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Convolution1D, Dropout, Activation
from keras.models import Sequential
from keras.models import Model

w2v_file = 'G:/pre_trained word embeddings/word2vec/vectors.bin'
train_data = './cqa_title/traindata/userprofilepythontitle.txt'
test_data = './cqa_title/testdata/TestQuestionsPythonTitle.txt'
EMBEDDING_DIM = 400
MAX_SEQUENCE_LENGTH = 25
NB_FILTER = 128
FILTER_LENGTH = 5


def build_data_cv(clean_string=False):
    """
    Loads the train and test data. Each line is '<label> <text>';
    "split" marks the origin of the row (0 = train, 1 = test).
    """
    revs = []
    vocab = defaultdict(float)
    with open(train_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 0}
            revs.append(datum)
    with open(test_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 1}
            revs.append(datum)
    return revs, vocab


def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower-cased except for TREC.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()


def load_bin_vec(fname, vocab):
    """
    Loads word vectors from a word2vec binary file (Mikolov's format)
    for the words that appear in vocab.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs


def add_unknown_words(word_vecs, vocab, min_df=1, k=EMBEDDING_DIM):
    """
    For words that occur in at least min_df documents, create a separate
    word vector. 0.25 is chosen so the unknown vectors have (approximately)
    the same variance as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
    return word_vecs


def get_W(word_vecs, k=EMBEDDING_DIM):
    """
    Get the word matrix. W[i] is the vector for the word indexed by i;
    row 0 is reserved as the zero padding row.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
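Note that load_bin_vec walks the binary format byte by byte and only works as written under Python 2 (f.read(1) returns a str there, bytes under Python 3). If gensim is available, the same binary file can be read through its KeyedVectors loader; a minimal sketch of that alternative, assuming gensim is installed:

# A sketch of an alternative loader using gensim (assumed available); it
# parses the same word2vec binary format as load_bin_vec above.
from gensim.models import KeyedVectors

def load_bin_vec_gensim(fname, vocab):
    # load_word2vec_format handles the header line and the float32 payload
    kv = KeyedVectors.load_word2vec_format(fname, binary=True)
    # keep only the words that actually occur in the dataset vocabulary
    return {word: kv[word] for word in vocab if word in kv}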
Using Theano backend.
print ("loading data...")revs, vocab = build_data_cv()df = pd.DataFrame(revs)max_l = np.max(df)["num_words"]print ("data loaded!")print ("number of sentences: " + str(len(revs)))print ("vocab size: " + str(len(vocab)))print ("max sentence length: " + str(max_l))
loading data...
data loaded!
number of sentences: 252313
vocab size: 40441
max sentence length: 37
print ("loading data...")revs, vocab = build_data_cv()
loading data...
print(type(revs), len(revs), type(vocab), len(vocab))
<type 'list'> 252313 <type 'collections.defaultdict'> 40441
print(revs[0])
{'y': 0, 'text': 'parsing and modification of sql statements in java', 'split': 0, 'num_words': 8}
print ("loding word2vec vectors...")w2v = load_bin_vec(w2v_file, vocab)print ("word2vec loaded!")print ("num words already in word2vec: " + str(len(w2v)))w2v = add_unknown_words(w2v, vocab)W, word_idx_map = get_W(w2v)print ("dataset creaded!")
loading word2vec vectors...
word2vec loaded!
num words already in word2vec: 28520
dataset created!
type(w2v)  # the dict of word embeddings
dict
len(w2v)
40441
print(type(W), len(W))
print(W.shape)
<type 'numpy.ndarray'> 40442
(40442L, 400L)
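As a quick sanity check on get_W: every row of W should equal the w2v vector of the word that word_idx_map sends to that row, and row 0 should stay all zeros for padding. A small spot check ('python' is an assumed vocabulary word here):

# spot-check that get_W preserved the vectors ('python' assumed to be in vocab)
assert np.allclose(W[word_idx_map['python']], w2v['python'])
assert not W[0].any()  # row 0 is the zero padding row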
texts = df.text
tokenizer = Tokenizer(nb_words=len(vocab))
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = df.y
labels = to_categorical(np.asarray(labels))
print("Shape of the data tensor:", data.shape)
print("Shape of the label tensor:", labels.shape)
Found 40435 unique tokens.
Shape of the data tensor: (252313L, 25L)
Shape of the label tensor: (252313L, 2064L)
print(type(data), len(data), type(labels))
<type 'numpy.ndarray'> 252313 <type 'numpy.ndarray'>
print(type(data[0]), len(data[0]), data[0].shape)
print(data[0])
print(data[1])
print(data[2])  # does Keras's pad_sequences zero-pad at the front? (see the note below)
<type 'numpy.ndarray'> 25 (25L,)
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 385 8 2985 6 26 727 2 27]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 234 1 4315 1 5683]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20 9 5 432 168 78 124 385 1913]
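This answers the question in the comment above: pad_sequences pads (and truncates) at the front by default, i.e. padding='pre', which is why the zeros sit before each sequence. Passing padding='post' appends them instead; a small demonstration (the printed token ids depend on the fitted tokenizer, so the layout in the comments is only illustrative):

# pad_sequences defaults to padding='pre' (zeros in front of the sequence);
# padding='post' appends them after it instead
demo = tokenizer.texts_to_sequences(["parsing sql in java"])
print(pad_sequences(demo, maxlen=10))                  # zeros first, tokens last
print(pad_sequences(demo, maxlen=10, padding='post'))  # tokens first, zeros last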
x_train = data[:248300]
y_train = labels[:248300]
x_test = data[248300:]
y_test = labels[248300:]
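The 248300 boundary is hard-coded. Since build_data_cv appends the training lines first and tags every row with split (0 = train, 1 = test), the same boundary can be derived from the DataFrame instead; a sketch:

# derive the train/test boundary from the "split" flag instead of hard-coding it
n_train = int((df["split"] == 0).sum())
x_train, y_train = data[:n_train], labels[:n_train]
x_test, y_test = data[n_train:], labels[n_train:]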
# load the pre-trained word embeddings into an Embedding layer;
# note that we set trainable=True so as to fine-tune the embeddings
embedding_layer = Embedding(len(vocab) + 1,
                            EMBEDDING_DIM,
                            weights=[W],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
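One caveat: the rows of W follow word_idx_map (the iteration order of the w2v dict), while the integer ids in data come from the Tokenizer's word_index, so with weights=[W] the pre-trained vectors do not land on the rows the model actually looks up. For the vectors to line up with the token ids, the embedding matrix should be built from word_index; a sketch of that fix (note it changes the input dimension to len(word_index) + 1, so the summary below, which was produced with W, would change accordingly):

# build an embedding matrix whose row i holds the vector of the token that
# the Tokenizer mapped to id i; row 0 stays zero for padding
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
for word, i in word_index.items():
    if word in w2v:
        embedding_matrix[i] = w2v[word]

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)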
print ("Training model.")model = Sequential()model.add(embedding_layer)model.add(Convolution1D(nb_filter=NB_FILTER, filter_length=FILTER_LENGTH, border_mode='valid', activation='relu', subsample_length=1))# use max pooling:model.add(MaxPooling1D(pool_length=model.output_shape[1]))# We flatten the output of the conv layer,model.add(Dropout(0.5))model.add(Activation('relu'))# so that we can add a vanilla dense layer:model.add(Flatten())# We add a vanilla hidden layer:model.add(Dense(128, activation='relu'))# We project onto a single unit output layer, and squash it with a sigmoid:model.add(Dense(len(labels[0]), activation='softmax'))model.summary()
Training model.
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to
====================================================================================================
embedding_1 (Embedding)          (None, 25, 400)       16176800    embedding_input_1[0][0]
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 21, 128)       256128      embedding_1[0][0]
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 1, 128)        0           convolution1d_1[0][0]
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 1, 128)        0           maxpooling1d_1[0][0]
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 1, 128)        0           dropout_1[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 128)           0           activation_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 128)           16512       flatten_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 2064)          266256      dense_1[0][0]
====================================================================================================
Total params: 16,715,696
Trainable params: 16,715,696
Non-trainable params: 0
____________________________________________________________________________________________________
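Setting pool_length to model.output_shape[1] (21 here, the number of convolution windows) makes the pooling global: one maximum per filter over the whole sequence, which is why maxpooling1d_1 outputs (None, 1, 128) and Flatten reduces it to (None, 128). Keras also provides GlobalMaxPooling1D, which does this in one step and drops the time axis itself; a sketch of the equivalent stack (not a drop-in replacement for the already-built model above):

# equivalent model using GlobalMaxPooling1D; its output is already 2D,
# so no Flatten layer is needed
from keras.layers import GlobalMaxPooling1D

model2 = Sequential()
model2.add(Embedding(len(vocab) + 1, EMBEDDING_DIM, weights=[W],
                     input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model2.add(Convolution1D(nb_filter=NB_FILTER, filter_length=FILTER_LENGTH,
                         border_mode='valid', activation='relu',
                         subsample_length=1))
model2.add(GlobalMaxPooling1D())          # (None, 128)
model2.add(Dropout(0.5))
model2.add(Dense(128, activation='relu'))
model2.add(Dense(len(labels[0]), activation='softmax'))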
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          nb_epoch=20,
          batch_size=128)
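Once fit returns, the held-out split can be scored directly; evaluate returns the loss followed by the metrics passed to compile:

# score the test split; returns [loss, acc] since compile used metrics=['acc']
score = model.evaluate(x_test, y_test, batch_size=128)
print("Test loss: %.4f, test accuracy: %.4f" % (score[0], score[1]))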