
Commit 38d0d0c

author: shixiaowen03
committed: session-based recommend rnn
1 parent aaae8c8 commit 38d0d0c

File tree

6 files changed: +444 -96 lines changed


.idea/workspace.xml

Lines changed: 110 additions & 90 deletions
(Generated IDE file; diff not rendered by default.)
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
git: https://github.com/Songweiping/GRU4Rec_TensorFlow

paper: Session-based Recommendations With Recurrent Neural Networks

site: http://arxiv.org/abs/1511.06939
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
"""
Created on Feb 27 2017
Author: Weiping Song
"""
import numpy as np
import pandas as pd


def evaluate_sessions_batch(model, train_data, test_data, cut_off=20, batch_size=50, session_key='SessionId',
                            item_key='ItemId', time_key='Time'):
    '''
    Evaluates the GRU4Rec network w.r.t. recommendation accuracy, measured by recall@N and MRR@N.

    Parameters
    --------
    model : A trained GRU4Rec model.
    train_data : Transactions of the training set. During evaluation it is only used to build the item-to-id map.
    test_data : Transactions of the test set. It has one column for session IDs, one for item IDs and one for the timestamps of the events (unix timestamps).
    cut_off : int
        Cut-off value (i.e. the length of the recommendation list; N for recall@N and MRR@N). Default value is 20.
    batch_size : int
        Number of events bundled into a batch during evaluation. Speeds up evaluation, but increases memory consumption if set high. Default value is 50.
    session_key : string
        Header of the session ID column in the input file (default: 'SessionId')
    item_key : string
        Header of the item ID column in the input file (default: 'ItemId')
    time_key : string
        Header of the timestamp column in the input file (default: 'Time')

    Returns
    --------
    out : tuple
        (Recall@N, MRR@N)

    '''
    model.predict = False
    # Build the item-to-id map from the training data.
    itemids = train_data[item_key].unique()
    itemidmap = pd.Series(data=np.arange(len(itemids)), index=itemids)

    test_data.sort_values([session_key, time_key], inplace=True)
    # offset_sessions[i] is the index of the first event of session i in the sorted test data.
    offset_sessions = np.zeros(test_data[session_key].nunique() + 1, dtype=np.int32)
    offset_sessions[1:] = test_data.groupby(session_key).size().cumsum()
    evaluation_point_count = 0
    mrr, recall = 0.0, 0.0
    if len(offset_sessions) - 1 < batch_size:
        batch_size = len(offset_sessions) - 1
    iters = np.arange(batch_size).astype(np.int32)
    maxiter = iters.max()
    start = offset_sessions[iters]
    end = offset_sessions[iters + 1]
    in_idx = np.zeros(batch_size, dtype=np.int32)
    np.random.seed(42)
    while True:
        valid_mask = iters >= 0
        if valid_mask.sum() == 0:
            break
        start_valid = start[valid_mask]
        minlen = (end[valid_mask] - start_valid).min()
        in_idx[valid_mask] = test_data[item_key].values[start_valid]
        for i in range(minlen - 1):
            out_idx = test_data[item_key].values[start_valid + i + 1]
            preds = model.predict_next_batch(iters, in_idx, itemidmap, batch_size)
            preds.fillna(0, inplace=True)
            in_idx[valid_mask] = out_idx
            # Rank of the true next item = number of items scored higher than it, plus one.
            ranks = (preds.values.T[valid_mask].T > np.diag(preds.loc[in_idx].values)[valid_mask]).sum(axis=0) + 1
            rank_ok = ranks < cut_off
            recall += rank_ok.sum()
            mrr += (1.0 / ranks[rank_ok]).sum()
            evaluation_point_count += len(ranks)
        start = start + minlen - 1
        # Sessions that have been fully consumed are replaced by the next unseen session.
        mask = np.arange(len(iters))[(valid_mask) & (end - start <= 1)]
        for idx in mask:
            maxiter += 1
            if maxiter >= len(offset_sessions) - 1:
                iters[idx] = -1
            else:
                iters[idx] = maxiter
                start[idx] = offset_sessions[maxiter]
                end[idx] = offset_sessions[maxiter + 1]
    return recall / evaluation_point_count, mrr / evaluation_point_count
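As a minimal usage sketch (not part of the commit): main.py below imports this module as `evaluation`, so evaluation might be invoked as follows, assuming `gru` is a hypothetical trained model object exposing `predict_next_batch` (as defined in model.py below), and that the time column matches the CSV header used by the demo; note that `batch_size` must equal the model's training batch size, because `predict_next_batch` rejects other values.

import pandas as pd
import evaluation

# `gru` is a hypothetical trained model; train/test CSVs are the toy data generated below.
train_df = pd.read_csv('data/train.csv', index_col=0)
test_df = pd.read_csv('data/test.csv', index_col=0)
recall, mrr = evaluation.evaluate_sessions_batch(
    gru, train_df, test_df,
    cut_off=20, batch_size=50,
    session_key='SessionId', item_key='ItemId', time_key='Timestamps')
print('Recall@20: {:.4f}\tMRR@20: {:.4f}'.format(recall, mrr))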
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
import pandas as pd
import numpy as np
import random

# Generate random (SessionId, ItemId, Timestamps) triples as toy training/test data.
train_data = np.zeros([3, 10000], dtype=np.int32)
test_data = np.zeros([3, 100], dtype=np.int32)

for i in range(10000):
    train_data[0, i] = random.randint(0, 200)    # session id
    train_data[1, i] = random.randint(0, 200)    # item id
    train_data[2, i] = random.randint(0, 20000)  # timestamp

for i in range(100):
    test_data[0, i] = random.randint(0, 200)
    test_data[1, i] = random.randint(0, 200)
    test_data[2, i] = random.randint(0, 20000)

train_data = np.transpose(train_data)
test_data = np.transpose(test_data)

train_df = pd.DataFrame(train_data, columns=['SessionId', 'ItemId', 'Timestamps']).to_csv('data/train.csv')
test_df = pd.DataFrame(test_data, columns=['SessionId', 'ItemId', 'Timestamps']).to_csv('data/test.csv')
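One observation (not part of the commit): `to_csv` without `index=False` also writes the row index as an extra unnamed first column, so a loader either drops it or passes `index_col=0`. A minimal sketch of reading the toy data back, assuming pandas:

import pandas as pd

# index_col=0 skips the unnamed index column written by to_csv above
# (alternatively, write the CSVs with index=False).
train_df = pd.read_csv('data/train.csv', index_col=0)
test_df = pd.read_csv('data/test.csv', index_col=0)
print(train_df.columns.tolist())  # ['SessionId', 'ItemId', 'Timestamps']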

recommendation/Basic-SessionBasedRNN-Demo/main.py

Lines changed: 5 additions & 5 deletions
@@ -9,8 +9,8 @@
 import model
 import evaluation
 
-train_data = ''
-test_data = ''
+train_data = 'data/train.csv'
+test_data = 'data/test.csv'
 
 class Args():
     is_training = False
@@ -21,13 +21,13 @@ class Args():
     dropout_p_hidden = 1
     learning_rate = 0.001
     decay = 0.96
-    decay_steps = 1e4
+    decay_steps = 10
     sigma = 0
     init_as_normal = False
     reset_after_session = True
     session_key = 'SessionId'
     item_key = 'ItemId'
-    time_key = 'Time'
+    time_key = 'Timestamps'
     grad_cap = 0
     test_model = 2
     checkpoint_dir = './checkpoint'
@@ -41,7 +41,7 @@ def parseArgs():
     parser = argparse.ArgumentParser(description='GRU4Rec args')
     parser.add_argument('--layer', default=1, type=int)
     parser.add_argument('--size', default=100, type=int)
-    parser.add_argument('--epoch', default=3, type=int)
+    parser.add_argument('--epoch', default=300, type=int)
     parser.add_argument('--lr', default=0.001, type=float)
     parser.add_argument('--train', default=1, type=int)
     parser.add_argument('--test', default=2, type=int)
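For context on the decay_steps change above: model.py (below) feeds these values into tf.train.exponential_decay(..., staircase=True), clamped at 1e-5, so with decay_steps = 10 the learning rate is multiplied by 0.96 every 10 global steps instead of every 10,000. A small plain-Python sketch of the resulting schedule, just to illustrate the formula:

def staircase_lr(base_lr=0.001, decay=0.96, decay_steps=10, step=0, floor=1e-5):
    # Mirrors tf.train.exponential_decay with staircase=True, plus the
    # tf.maximum(1e-5, ...) clamp applied in model.py.
    return max(floor, base_lr * decay ** (step // decay_steps))

for s in (0, 10, 100, 1000):
    print(s, staircase_lr(step=s))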

recommendation/Basic-SessionBasedRNN-Demo/model.py

Lines changed: 216 additions & 1 deletion
@@ -52,7 +52,7 @@ def __init__(self,sess,args):
         if args.final_act == 'linear':
             self.final_activation = self.linear
         elif args.final_act == 'relu':
-            self.final_activatin = self.relu
+            self.final_activation = self.relu
         else:
             self.final_activation = self.tanh
         self.loss_function = self.top1
@@ -109,4 +109,219 @@ def top1(self, yhat):
        term1 = tf.reduce_mean(tf.nn.sigmoid(-tf.diag_part(yhat) + yhatT) + tf.nn.sigmoid(yhatT ** 2), axis=0)
        term2 = tf.nn.sigmoid(tf.diag_part(yhat) ** 2) / self.batch_size
        return tf.reduce_mean(term1 - term2)

    def build_model(self):
        self.X = tf.placeholder(tf.int32, [self.batch_size], name='input')
        self.Y = tf.placeholder(tf.int32, [self.batch_size], name='output')
        self.state = [tf.placeholder(tf.float32, [self.batch_size, self.rnn_size], name='rnn_state') for _ in range(self.layers)]
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        with tf.variable_scope('gru_layer'):
            sigma = self.sigma if self.sigma != 0 else np.sqrt(6.0 / (self.n_items + self.rnn_size))
            if self.init_as_normal:
                initializer = tf.random_normal_initializer(mean=0, stddev=sigma)
            else:
                initializer = tf.random_uniform_initializer(minval=-sigma, maxval=sigma)

            embedding = tf.get_variable('embedding', [self.n_items, self.rnn_size], initializer=initializer)
            softmax_W = tf.get_variable('softmax_w', [self.n_items, self.rnn_size], initializer=initializer)
            softmax_b = tf.get_variable('softmax_b', [self.n_items], initializer=tf.constant_initializer(0.0))

            cell = rnn_cell.GRUCell(self.rnn_size, activation=self.hidden_act)
            drop_cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=self.dropout_p_hidden)
            stacked_cell = rnn_cell.MultiRNNCell([drop_cell] * self.layers)

            inputs = tf.nn.embedding_lookup(embedding, self.X)
            output, state = stacked_cell(inputs, tuple(self.state))
            self.final_state = state

        if self.is_training:
            # During training, score only the items that appear as targets in the batch;
            # the other examples' targets act as negative samples.
            sampled_W = tf.nn.embedding_lookup(softmax_W, self.Y)
            sampled_b = tf.nn.embedding_lookup(softmax_b, self.Y)

            logits = tf.matmul(output, sampled_W, transpose_b=True) + sampled_b
            self.y_hat = self.final_activation(logits)
            self.cost = self.loss_function(self.y_hat)
        else:
            # At prediction time, score all items.
            logits = tf.matmul(output, softmax_W, transpose_b=True) + softmax_b
            self.y_hat = self.final_activation(logits)

        if not self.is_training:
            return

        self.lr = tf.maximum(1e-5,
                             tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps,
                                                        self.decay, staircase=True))

        optimizer = tf.train.AdamOptimizer(self.lr)
        tvars = tf.trainable_variables()
        gvs = optimizer.compute_gradients(self.cost, tvars)
        if self.grad_cap > 0:
            capped_gvs = [(tf.clip_by_norm(grad, self.grad_cap), var) for grad, var in gvs]
        else:
            capped_gvs = gvs
        self.train_op = optimizer.apply_gradients(capped_gvs, global_step=self.global_step)

    def init(self, data):
        data.sort_values([self.session_key, self.time_key], inplace=True)
        offset_sessions = np.zeros(data[self.session_key].nunique() + 1, dtype=np.int32)
        # e.g. [0, 25, 38] means the first session has 25 events and the second has 13
        offset_sessions[1:] = data.groupby(self.session_key).size().cumsum()
        return offset_sessions

    def fit(self, data):
        self.error_during_train = False

        itemids = data[self.item_key].unique()
        self.n_items = len(itemids)
        self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids)

        data = pd.merge(data, pd.DataFrame({self.item_key: itemids, 'ItemIdx': self.itemidmap[itemids].values}),
                        on=self.item_key, how='inner')
        offset_sessions = self.init(data)

        print('fitting model...')

        for epoch in range(self.n_epochs):
            epoch_cost = []
            state = [np.zeros([self.batch_size, self.rnn_size], dtype=np.float32) for _ in range(self.layers)]
            session_idx_arr = np.arange(len(offset_sessions) - 1)
            iters = np.arange(self.batch_size)

            maxiter = iters.max()
            start = offset_sessions[session_idx_arr[iters]]
            end = offset_sessions[session_idx_arr[iters] + 1]

            finished = False
            while not finished:
                minlen = (end - start).min()
                out_idx = data.ItemIdx.values[start]
                for i in range(minlen - 1):
                    in_idx = out_idx
                    out_idx = data.ItemIdx.values[start + i + 1]
                    fetches = [self.cost, self.final_state, self.global_step, self.lr, self.train_op]
                    feed_dict = {self.X: in_idx, self.Y: out_idx}

                    for j in range(self.layers):
                        feed_dict[self.state[j]] = state[j]

                    cost, state, step, lr, _ = self.sess.run(fetches, feed_dict)

                    epoch_cost.append(cost)
                    if np.isnan(cost):
                        print(str(epoch) + ': NaN error!')
                        self.error_during_train = True
                        return
                    if step == 1 or step % self.decay_steps == 0:
                        avgc = np.mean(epoch_cost)
                        print('Epoch {}\tStep {}\tlr: {:.6f}\tloss: {:.6f}'.format(epoch, step, lr, avgc))

                start = start + minlen - 1
                mask = np.arange(len(iters))[(end - start) <= 1]  # which sessions in the batch have finished

                for idx in mask:
                    maxiter += 1
                    if maxiter >= len(offset_sessions) - 1:
                        print("epoch finished")
                        finished = True
                        break
                    # hand the finished slot over to the next unseen session
                    iters[idx] = maxiter
                    start[idx] = offset_sessions[session_idx_arr[maxiter]]
                    end[idx] = offset_sessions[session_idx_arr[maxiter] + 1]

                if len(mask) and self.reset_after_session:
                    for i in range(self.layers):
                        state[i][mask] = 0

            avgc = np.mean(epoch_cost)
            if np.isnan(avgc):
                print('Epoch {}: NaN error!'.format(epoch))
                self.error_during_train = True
                return
            self.saver.save(self.sess, '{}/gru-model'.format(self.checkpoint_dir), global_step=epoch)

    def predict_next_batch(self, session_ids, input_item_ids, itemidmap, batch=50):
        '''
        Gives prediction scores for a selected set of items. Can be used in batch mode to predict for multiple independent events (i.e. events of different sessions) at once, which speeds up evaluation.

        If the session ID at a given coordinate of the session_ids parameter remains the same during subsequent calls of the function, the corresponding hidden state of the network is kept intact (i.e. that is how one predicts the next items of a session).
        If it changes, the hidden state of the network is reset to zeros.

        Parameters
        --------
        session_ids : 1D array
            Contains the session IDs of the events of the batch. Its length must equal the prediction batch size (batch param).
        input_item_ids : 1D array
            Contains the item IDs of the events of the batch. Every item ID must be in the training data of the network. Its length must equal the prediction batch size (batch param).
        batch : int
            Prediction batch size.

        Returns
        --------
        out : pandas.DataFrame
            Prediction scores for selected items for every event of the batch.
            Columns: events of the batch; rows: items. Rows are indexed by the item IDs.

        '''
        if batch != self.batch_size:
            raise Exception('Predict batch size({}) must match train batch size({})'.format(batch, self.batch_size))
        if not self.predict:
            self.current_session = np.ones(batch) * -1
            self.predict = True

        session_change = np.arange(batch)[session_ids != self.current_session]
        if len(session_change) > 0:  # reset hidden states where the session changed
            for i in range(self.layers):
                self.predict_state[i][session_change] = 0.0
            self.current_session = session_ids.copy()

        in_idxs = itemidmap[input_item_ids]
        fetches = [self.y_hat, self.final_state]
        feed_dict = {self.X: in_idxs}
        for i in range(self.layers):
            feed_dict[self.state[i]] = self.predict_state[i]
        preds, self.predict_state = self.sess.run(fetches, feed_dict)
        preds = np.asarray(preds).T
        return pd.DataFrame(data=preds, index=itemidmap.index)
294+
295+
296+
297+
298+
299+
300+
301+
302+
303+
304+
305+
306+
307+
308+
309+
310+
311+
312+
313+
314+
315+
316+
317+
318+
319+
320+
321+
322+
323+
324+
325+
326+
112327
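As a side note on the TOP1 loss visible in the context lines of this hunk (the tail of top1 above): the same computation re-stated in NumPy, purely as an illustration and not part of the commit. It assumes yhatT in the original method is tf.transpose(yhat), whose definition falls outside the shown diff.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def top1_loss_np(yhat):
    # yhat: (batch, batch) score matrix; yhat[k, k] is the score of the true
    # next item of example k, and the other entries in row k are the scores
    # of the in-batch negative samples.
    diag = np.diag(yhat)
    term1 = np.mean(sigmoid(yhat.T - diag) + sigmoid(yhat.T ** 2), axis=0)
    term2 = sigmoid(diag ** 2) / yhat.shape[0]
    return np.mean(term1 - term2)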
