Skip to content

Commit 14cdf91

Browse files
author
shixiaowen03
committed
NAIS
1 parent 58965de commit 14cdf91

File tree

7 files changed

+1145
-249
lines changed

7 files changed

+1145
-249
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ test_data/
4949
result/
5050
export_ptr_model/
5151
data/
52+
Data/
53+
Log/
54+
log/
5255
__pycache__/
5356

5457
log/

.idea/workspace.xml

Lines changed: 229 additions & 249 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import multiprocessing
2+
import numpy as np
3+
4+
# Module-level state shared between shuffle() and the batch builders.
# Everything starts as None and is filled in by shuffle().

# Inputs handed to shuffle().
_Dataset = None          # dataset object (read via .trainMatrix / .trainList)
_batch_size = None       # samples per batch, parsed from "fixed:<n>"
_num_negatives = None    # negatives drawn for every positive sample

# Flat training samples produced by the _get_train_data_* helpers.
_num_items = None        # number of item columns; also used as the padding id
_user_input = None       # user id of every sample
_item_input = None       # item id of every sample
_labels = None           # 1 for a positive sample, 0 for a sampled negative

# Batch bookkeeping.
_index = None            # sample order used in "fixed" mode
_num_batch = None        # number of batches to build
_batch_length = None     # cumulative end offset of every user's samples ("user" mode)
14+
15+
16+
def shuffle(dataset, batch_choice, num_negatives):
    """Sample negatives for ``dataset`` and pre-build every training batch.

    The arguments and the derived sample lists are stored in module-level
    globals so that the per-batch builders (which may run in worker
    processes) can read them.

    Args:
        dataset: object exposing ``trainMatrix`` and ``trainList``.
        batch_choice: ``'user'`` to build one batch per user, or
            ``'fixed:<n>'`` to build batches of ``n`` samples each.
        num_negatives: number of negative items sampled per positive.

    Returns:
        The tuple produced by ``_preprocess``: four parallel lists
        (user inputs, history lengths, item inputs, labels), one entry per
        batch. For an unrecognised ``batch_choice`` a message is printed and
        ``None`` is returned.
    """
    global _Dataset
    global _batch_size
    global _num_negatives
    global _num_items
    global _user_input
    global _item_input
    global _labels
    global _index
    global _num_batch
    global _batch_length

    _Dataset = dataset
    _num_negatives = num_negatives

    if batch_choice == 'user':
        (_num_items, _user_input, _item_input,
         _labels, _batch_length) = _get_train_data_user()
        _num_batch = len(_batch_length)
        return _preprocess(_get_train_batch_user)

    batch_choices = batch_choice.split(":")
    if batch_choices[0] == 'fixed':
        _batch_size = int(batch_choices[1])
        _num_items, _user_input, _item_input, _labels = _get_train_data_fixed()
        iterations = len(_user_input)
        _index = np.arange(iterations)
        # Floor division keeps the batch count an int on Python 3 (true
        # division would yield a float that range() rejects). Samples that
        # do not fill a whole batch are left out.
        _num_batch = iterations // _batch_size
        return _preprocess(_get_train_batch_fixed)

    print("invalid batch size !")
47+
48+
49+
def batch_gen(batches, i):
    """Return the ``i``-th entry of each of the first four sequences in ``batches``.

    ``batches`` is presumably the 4-tuple returned by ``shuffle`` (verify
    against the caller); the result is a list with one element per sequence.
    """
    return [batches[0][i], batches[1][i], batches[2][i], batches[3][i]]
51+
52+
53+
def _preprocess(get_train_batch):
    """Build every batch with ``get_train_batch`` and regroup the results.

    Batches ``0 .. _num_batch - 1`` are built serially on a single-CPU
    machine and through a ``multiprocessing.Pool`` otherwise.

    Args:
        get_train_batch: callable taking a batch index and returning a
            4-tuple ``(user_input, num_idx, item_input, labels)``.

    Returns:
        A tuple of four lists (user inputs, history lengths, item inputs,
        labels), each holding one entry per batch.
    """
    # range() requires an int; int() also tolerates a float batch count.
    batch_ids = list(range(int(_num_batch)))
    cpu_count = multiprocessing.cpu_count()
    if cpu_count == 1:
        res = [get_train_batch(i) for i in batch_ids]
    else:
        pool = multiprocessing.Pool(cpu_count)
        try:
            res = pool.map(get_train_batch, batch_ids)
        finally:
            # Always release the worker processes, even if a batch failed.
            pool.close()
            pool.join()
    user_input_list = [r[0] for r in res]
    num_idx_list = [r[1] for r in res]
    item_input_list = [r[2] for r in res]
    labels_list = [r[3] for r in res]
    return (user_input_list, num_idx_list, item_input_list, labels_list)
73+
74+
75+
def _get_train_data_user():
    """Build the flat training samples grouped user by user.

    For every item in a user's ``trainList`` row one positive sample is
    emitted, followed by ``_num_negatives`` negatives drawn uniformly from
    the items that are not in that row.

    Returns:
        ``(num_items, user_input, item_input, labels, batch_length)`` where
        the three middle lists are parallel (one entry per sample) and
        ``batch_length[u]`` is the cumulative number of samples up to and
        including user ``u``.

    Raises:
        ValueError: if negatives are requested for a user whose row already
            covers every item (rejection sampling could never finish).
    """
    user_input, item_input, labels, batch_length = [], [], [], []
    train = _Dataset.trainMatrix
    trainList = _Dataset.trainList
    num_items = train.shape[1]
    num_users = train.shape[0]
    total = 0
    for u in range(num_users):
        rated = trainList[u]
        # Set membership keeps each rejection test O(1) instead of O(len(rated)).
        rated_set = set(rated)
        if _num_negatives > 0 and rated and len(rated_set) >= num_items:
            raise ValueError(
                "user %d has interacted with every item; "
                "cannot sample negatives" % u)
        total += (1 + _num_negatives) * len(rated)
        batch_length.append(total)
        for i in rated:
            # positive instance
            user_input.append(u)
            item_input.append(i)
            labels.append(1)
            # negative instances
            for _ in range(_num_negatives):
                j = np.random.randint(num_items)
                while j in rated_set:
                    j = np.random.randint(num_items)
                user_input.append(u)
                item_input.append(j)
                labels.append(0)
    return num_items, user_input, item_input, labels, batch_length
100+
101+
102+
def _get_train_batch_user(i):
    """Build the batch made of all samples that belong to user ``i``.

    Each sample is represented by a copy of the user's training item list
    with the sample's own item swapped out for the padding id
    (``_num_items``); the rows are then padded to a common length.

    Returns:
        ``(user_input, num_idx, item_input, labels)`` as numpy arrays, with
        the samples in a random order.
    """
    train_rows = _Dataset.trainList
    start = 0 if i == 0 else _batch_length[i - 1]
    order = list(range(start, _batch_length[i]))
    np.random.shuffle(order)

    histories, history_sizes, targets, target_labels = [], [], [], []
    for sample in order:
        user = _user_input[sample]
        target = _item_input[sample]
        # Work on a copy: _remove_item edits the row in place.
        history = list(train_rows[user])
        history_sizes.append(_remove_item(_num_items, history, target))
        histories.append(history)
        targets.append(target)
        target_labels.append(_labels[sample])

    padded = _add_mask(_num_items, histories, max(history_sizes))
    return (np.array(padded), np.array(history_sizes),
            np.array(targets), np.array(target_labels))
126+
127+
128+
def _get_train_data_fixed():
    """Build the flat training samples for fixed-size batching.

    Every stored ``(user, item)`` key of the training matrix yields one
    positive sample followed by ``_num_negatives`` negatives, drawn
    uniformly from the items the user has no stored entry for.

    Returns:
        ``(num_items, user_input, item_input, labels)`` where the three
        lists are parallel, one entry per sample.
    """
    user_input, item_input, labels = [], [], []
    train = _Dataset.trainMatrix
    num_items = train.shape[1]
    # dict.has_key() is the Python 2 API and is unavailable on Python 3;
    # snapshot the positive pairs once and test membership in the set.
    positives = set(train.keys())
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for _ in range(_num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in positives:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return num_items, user_input, item_input, labels
147+
148+
149+
def _get_train_batch_fixed(i):
    """Build the ``i``-th batch of ``_batch_size`` consecutive samples.

    Samples are looked up through ``_index``. Each one is represented by a
    copy of its user's training item list with the sample's own item
    swapped out for the padding id (``_num_items``); the rows are then
    padded to a common length.

    Returns:
        ``(user_input, num_idx, item_input, labels)`` as numpy arrays.
    """
    train_rows = _Dataset.trainList
    start = i * _batch_size

    histories, history_sizes, targets, target_labels = [], [], [], []
    for offset in range(start, start + _batch_size):
        sample = _index[offset]
        user = _user_input[sample]
        target = _item_input[sample]
        # Work on a copy: _remove_item edits the row in place.
        history = list(train_rows[user])
        history_sizes.append(_remove_item(_num_items, history, target))
        histories.append(history)
        targets.append(target)
        target_labels.append(_labels[sample])

    padded = _add_mask(_num_items, histories, max(history_sizes))
    return (np.array(padded), np.array(history_sizes),
            np.array(targets), np.array(target_labels))
168+
169+
170+
def _remove_item(feature_mask, users, item):
    """Mask the first occurrence of ``item`` in ``users`` (edited in place).

    The matching slot is overwritten with the last element and the last
    slot is set to ``feature_mask``, so the list keeps its length.

    Returns:
        The number of entries left unmasked: ``len(users) - 1`` when
        ``item`` was found, ``len(users)`` otherwise.
    """
    for pos, current in enumerate(users):
        if current == item:
            users[pos] = users[-1]
            users[-1] = feature_mask
            return len(users) - 1
    return len(users)
179+
180+
181+
def _add_mask(feature_mask, features, num_max):
    """Pad every row of ``features`` with ``feature_mask`` up to ``num_max + 1``.

    Rows already that long or longer are left at their length. The outer
    list is updated in place (each slot gets a new row list) and returned.
    """
    target_len = num_max + 1
    for pos, row in enumerate(features):
        padding = [feature_mask] * (target_len - len(row))
        features[pos] = row + padding
    return features
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import scipy.sparse as sp
2+
import numpy as np
3+
4+
# Per-user cap applied by Dataset.load_training_file_as_list: an item is kept
# only while the user's running item counter is strictly below this value.
ITEM_CLIP = 300
5+
6+
class Dataset(object):
    """Training and test data loaded from tab-separated rating files.

    Given a path prefix, the constructor reads ``<path>.train.rating``,
    ``<path>.test.rating`` and ``<path>.test.negative``.

    Attributes set by the constructor:
        trainMatrix: ``scipy.sparse.dok_matrix`` with 1.0 at every
            (user, item) pair that has a positive rating.
        trainList: per-user list of training item ids (clipped, see
            ``load_training_file_as_list``).
        testRatings: list of ``[user, item]`` test pairs.
        testNegatives: list of negative item-id lists, one per test line.
        num_users, num_items: shape of ``trainMatrix``.
    """

    def __init__(self, path):
        self.trainMatrix = self.load_training_file_as_matrix(path + ".train.rating")
        self.trainList = self.load_training_file_as_list(path + ".train.rating")
        self.testRatings = self.load_rating_file_as_list(path + '.test.rating')
        self.testNegatives = self.load_negative_file(path + ".test.negative")
        assert len(self.testRatings) == len(self.testNegatives)
        self.num_users, self.num_items = self.trainMatrix.shape

    @staticmethod
    def _iter_rows(filename):
        """Yield the tab-separated fields of every non-blank line of ``filename``."""
        with open(filename, "r") as f:
            for line in f:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing on int('') further down.
                if line.strip():
                    yield line.split("\t")

    def load_negative_file(self, filename):
        """Return one list of negative item ids per line of ``filename``.

        The first field of each line is ignored; the remaining fields are
        parsed as ints (one line holds all negatives of one user).
        """
        return [[int(x) for x in arr[1:]] for arr in self._iter_rows(filename)]

    def load_rating_file_as_list(self, filename):
        """Return the ``[user, item]`` pair found on every line of ``filename``."""
        return [[int(arr[0]), int(arr[1])] for arr in self._iter_rows(filename)]

    def load_training_file_as_list(self, filename):
        """Return the item ids rated by each user, indexed by user id.

        The file is expected to be sorted by user id. Users that do not
        appear in the file get an empty list, so that index ``u`` of the
        result always refers to user ``u``. Per user, items are kept only
        while the running counter stays below ``ITEM_CLIP`` (the counter is
        incremented before the test, so at most ``ITEM_CLIP - 1`` are kept).
        """
        current_user = 0
        lists, items = [], []
        index = 0
        for arr in self._iter_rows(filename):
            u, i = int(arr[0]), int(arr[1])
            # Close the previous user's list and emit an empty list for each
            # missing id, so a gap never shifts later users onto wrong slots.
            while current_user < u:
                lists.append(items)
                items = []
                current_user += 1
                index = 0
            index += 1
            if index < ITEM_CLIP:
                items.append(i)
        lists.append(items)
        print("already load the trainList...")
        return lists

    def load_training_file_as_matrix(self, filename):
        """Return the training interactions as a sparse 0/1 matrix.

        A first pass over the file finds the largest user and item ids to
        size the matrix; a second pass stores 1.0 for every line whose
        rating (third field) is positive.
        """
        num_users, num_items = 0, 0
        for arr in self._iter_rows(filename):
            u, i = int(arr[0]), int(arr[1])
            num_users = max(num_users, u)
            num_items = max(num_items, i)

        # Ids are zero-based, hence the +1 on both dimensions.
        mat = sp.dok_matrix((num_users + 1, num_items + 1), dtype=np.float32)
        for arr in self._iter_rows(filename):
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            if rating > 0:
                mat[user, item] = 1.0

        print("already load the trainMatrix...")
        return mat

0 commit comments

Comments
 (0)