from hyperparams import Hyperparams as hp
import tensorflow as tf  # TF 1.x: get_batch_data() relies on the queue-based input pipeline
import numpy as np
import codecs
import regex


def load_de_vocab():
    """Load the German vocabulary, keeping tokens seen at least hp.min_cnt times."""
    vocab = [line.split()[0] for line in codecs.open('data/de.vocab.tsv', 'r', 'utf-8').read().splitlines()
             if int(line.split()[1]) >= hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

def load_en_vocab():
    """Load the English vocabulary, keeping tokens seen at least hp.min_cnt times."""
    vocab = [line.split()[0] for line in codecs.open('data/en.vocab.tsv', 'r', 'utf-8').read().splitlines()
             if int(line.split()[1]) >= hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word
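
# Usage sketch (illustrative, not part of the original module). It assumes the
# *.vocab.tsv files list one "token<TAB>count" pair per line, with <PAD> first
# and <UNK> second, matching the padding id 0 and the OOV id 1 used below:
#
#   en2idx, idx2en = load_en_vocab()
#   en2idx['<PAD>']  # -> 0, the id used for padding in create_data()
#   en2idx['<UNK>']  # -> 1, the fallback id for out-of-vocabulary tokens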


def create_data(source_sents, target_sents):
    """Convert parallel sentences to padded id arrays, dropping pairs longer than hp.maxlen."""
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Index the sentences, mapping unknown words to 1 (<UNK>) and appending </S>.
    x_list, y_list, Sources, Targets = [], [], [], []
    for source_sent, target_sent in zip(source_sents, target_sents):
        x = [de2idx.get(word, 1) for word in (source_sent + u" </S>").split()]  # 1: OOV, </S>: end of text
        y = [en2idx.get(word, 1) for word in (target_sent + u" </S>").split()]
        if max(len(x), len(y)) <= hp.maxlen:
            x_list.append(np.array(x))
            y_list.append(np.array(y))
            Sources.append(source_sent)
            Targets.append(target_sent)

    # Pad every sequence with 0 (<PAD>) up to hp.maxlen.
    X = np.zeros([len(x_list), hp.maxlen], np.int32)
    Y = np.zeros([len(y_list), hp.maxlen], np.int32)
    for i, (x, y) in enumerate(zip(x_list, y_list)):
        X[i] = np.pad(x, [0, hp.maxlen - len(x)], 'constant', constant_values=(0, 0))
        Y[i] = np.pad(y, [0, hp.maxlen - len(y)], 'constant', constant_values=(0, 0))
    return X, Y, Sources, Targets
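
# A quick check of the padding contract (illustrative; exact shapes depend on
# your hyperparams.py). Note that pairs exceeding hp.maxlen after </S> is
# appended are silently dropped rather than truncated:
#
#   X, Y, Sources, Targets = create_data([u"ich bin hier"], [u"i am here"])
#   X.shape  # -> (1, hp.maxlen); positions past the sentence hold 0 (<PAD>)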


def load_train_data():
    def _refine(line):
        # Strip everything except whitespace, Latin letters, and apostrophes.
        line = regex.sub(r"[^\s\p{Latin}']", "", line)
        return line.strip()

    # Skip XML-ish metadata lines, which start with "<".
    de_sents = [_refine(line) for line in codecs.open(hp.source_train, 'r', 'utf-8').read().split('\n') if
                line and line[0] != "<"]
    en_sents = [_refine(line) for line in codecs.open(hp.target_train, 'r', 'utf-8').read().split('\n') if
                line and line[0] != "<"]

    X, Y, Sources, Targets = create_data(de_sents, en_sents)
    return X, Y


def load_test_data():
    def _refine(line):
        # Drop SGML tags, then apply the same character filtering as training.
        line = regex.sub("<[^>]+>", "", line)
        line = regex.sub(r"[^\s\p{Latin}']", "", line)
        return line.strip()

    # In the SGML-formatted test files, only "<seg ...>" lines carry sentences.
    de_sents = [_refine(line) for line in codecs.open(hp.source_test, 'r', 'utf-8').read().split('\n') if line and line[:4] == "<seg"]
    en_sents = [_refine(line) for line in codecs.open(hp.target_test, 'r', 'utf-8').read().split('\n') if line and line[:4] == "<seg"]

    X, Y, Sources, Targets = create_data(de_sents, en_sents)
    return X, Sources, Targets  # Y is not needed at test time
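
# Evaluation-side usage sketch (illustrative): the test loader returns the
# padded source ids plus the raw sentence pairs, so the references stay
# available as strings, e.g. for scoring decoded output against Targets:
#
#   X, Sources, Targets = load_test_data()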


def get_batch_data():
    """Build a shuffled (x, y) batch queue from the training data (TF 1.x queue API)."""
    X, Y = load_train_data()

    # Number of full batches per epoch.
    num_batch = len(X) // hp.batch_size

    X = tf.convert_to_tensor(X, tf.int32)
    Y = tf.convert_to_tensor(Y, tf.int32)

    # Slice the tensors into single examples, then rebatch them with shuffling.
    input_queues = tf.train.slice_input_producer([X, Y])
    x, y = tf.train.shuffle_batch(input_queues,
                                  num_threads=8,
                                  batch_size=hp.batch_size,
                                  capacity=hp.batch_size * 64,
                                  min_after_dequeue=hp.batch_size * 32,
                                  allow_smaller_final_batch=False)

    return x, y, num_batch
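

# Minimal smoke test (an assumption, not part of the original module). Because
# get_batch_data() uses the TF 1.x queue pipeline, queue runners must be
# started before the batch tensors can be evaluated:
if __name__ == '__main__':
    x, y, num_batch = get_batch_data()
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        x_np, y_np = sess.run([x, y])  # one shuffled batch of id sequences
        print(x_np.shape, y_np.shape)  # (hp.batch_size, hp.maxlen) each
        coord.request_stop()
        coord.join(threads)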