from hyperparams import Hyperparams as hp
import tensorflow as tf  # TF 1.x: get_batch_data() relies on the queue-based input pipeline
import numpy as np
import codecs
import regex


def load_de_vocab():
    """Load the German vocabulary, keeping tokens seen at least hp.min_cnt times."""
    vocab = [line.split()[0] for line in codecs.open('data/de.vocab.tsv', 'r', 'utf-8').read().splitlines()
             if int(line.split()[1]) >= hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

def load_en_vocab():
    """Load the English vocabulary, keeping tokens seen at least hp.min_cnt times."""
    vocab = [line.split()[0] for line in codecs.open('data/en.vocab.tsv', 'r', 'utf-8').read().splitlines()
             if int(line.split()[1]) >= hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word
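
# Usage sketch (illustrative, not part of the original module). It assumes the
# *.vocab.tsv files list one "token<TAB>count" pair per line, with <PAD> first
# and <UNK> second, matching the padding id 0 and the OOV id 1 used below:
#
#   en2idx, idx2en = load_en_vocab()
#   en2idx['<PAD>']  # -> 0, the id used for padding in create_data()
#   en2idx['<UNK>']  # -> 1, the fallback id for out-of-vocabulary tokens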


def create_data(source_sents, target_sents):
    """Convert parallel sentences to padded id arrays, dropping pairs longer than hp.maxlen."""
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Index the sentences, mapping unknown words to 1 (<UNK>) and appending </S>.
    x_list, y_list, Sources, Targets = [], [], [], []
    for source_sent, target_sent in zip(source_sents, target_sents):
        x = [de2idx.get(word, 1) for word in (source_sent + u" </S>").split()]  # 1: OOV, </S>: end of text
        y = [en2idx.get(word, 1) for word in (target_sent + u" </S>").split()]
        if max(len(x), len(y)) <= hp.maxlen:
            x_list.append(np.array(x))
            y_list.append(np.array(y))
            Sources.append(source_sent)
            Targets.append(target_sent)

    # Pad every sequence with 0 (<PAD>) up to hp.maxlen.
    X = np.zeros([len(x_list), hp.maxlen], np.int32)
    Y = np.zeros([len(y_list), hp.maxlen], np.int32)
    for i, (x, y) in enumerate(zip(x_list, y_list)):
        X[i] = np.pad(x, [0, hp.maxlen - len(x)], 'constant', constant_values=(0, 0))
        Y[i] = np.pad(y, [0, hp.maxlen - len(y)], 'constant', constant_values=(0, 0))
    return X, Y, Sources, Targets
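
# A quick check of the padding contract (illustrative; exact shapes depend on
# your hyperparams.py). Note that pairs exceeding hp.maxlen after </S> is
# appended are silently dropped rather than truncated:
#
#   X, Y, Sources, Targets = create_data([u"ich bin hier"], [u"i am here"])
#   X.shape  # -> (1, hp.maxlen); positions past the sentence hold 0 (<PAD>)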


def load_train_data():
    def _refine(line):
        # Strip everything except whitespace, Latin letters, and apostrophes.
        line = regex.sub(r"[^\s\p{Latin}']", "", line)
        return line.strip()

    # Skip XML-ish metadata lines, which start with "<".
    de_sents = [_refine(line) for line in codecs.open(hp.source_train, 'r', 'utf-8').read().split('\n') if
                line and line[0] != "<"]
    en_sents = [_refine(line) for line in codecs.open(hp.target_train, 'r', 'utf-8').read().split('\n') if
                line and line[0] != "<"]

    X, Y, Sources, Targets = create_data(de_sents, en_sents)
    return X, Y


def load_test_data():
    def _refine(line):
        # Drop SGML tags, then apply the same character filtering as training.
        line = regex.sub("<[^>]+>", "", line)
        line = regex.sub(r"[^\s\p{Latin}']", "", line)
        return line.strip()

    # In the SGML-formatted test files, only "<seg ...>" lines carry sentences.
    de_sents = [_refine(line) for line in codecs.open(hp.source_test, 'r', 'utf-8').read().split('\n') if line and line[:4] == "<seg"]
    en_sents = [_refine(line) for line in codecs.open(hp.target_test, 'r', 'utf-8').read().split('\n') if line and line[:4] == "<seg"]

    X, Y, Sources, Targets = create_data(de_sents, en_sents)
    return X, Sources, Targets  # Y is not needed at test time
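
# Evaluation-side usage sketch (illustrative): the test loader returns the
# padded source ids plus the raw sentence pairs, so the references stay
# available as strings, e.g. for scoring decoded output against Targets:
#
#   X, Sources, Targets = load_test_data()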


def get_batch_data():
    """Build a shuffled (x, y) batch queue from the training data (TF 1.x queue API)."""
    X, Y = load_train_data()

    # Number of full batches per epoch.
    num_batch = len(X) // hp.batch_size

    X = tf.convert_to_tensor(X, tf.int32)
    Y = tf.convert_to_tensor(Y, tf.int32)

    # Slice the tensors into single examples, then rebatch them with shuffling.
    input_queues = tf.train.slice_input_producer([X, Y])
    x, y = tf.train.shuffle_batch(input_queues,
                                  num_threads=8,
                                  batch_size=hp.batch_size,
                                  capacity=hp.batch_size * 64,
                                  min_after_dequeue=hp.batch_size * 32,
                                  allow_smaller_final_batch=False)

    return x, y, num_batch
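

# Minimal smoke test (an assumption, not part of the original module). Because
# get_batch_data() uses the TF 1.x queue pipeline, queue runners must be
# started before the batch tensors can be evaluated:
if __name__ == '__main__':
    x, y, num_batch = get_batch_data()
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        x_np, y_np = sess.run([x, y])  # one shuffled batch of id sequences
        print(x_np.shape, y_np.shape)  # (hp.batch_size, hp.maxlen) each
        coord.request_stop()
        coord.join(threads)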