介绍
基于上个baseline,这次新加的逻辑是越近期的交易越重要:
- 如果最近1周有交易记录,则推荐最近一周买过的商品
- 否则,如果最近2周有交易记录,则推荐最近2周买过的商品
- 否则,如果最近3周有交易记录,则推荐最近3周买过的商品
- 否则,推荐最近1周销量最高的12个商品
代码
导入包
import numpy as np
import pandas as pd
from pathlib import Path
# Directory holding the competition input files; 'transactions_train.csv' and
# 'sample_submission.csv' are read from it below.
data_path = Path('/kaggle/input/h-and-m-personalized-fashion-recommendations/')
交易数据集
# Sample submission: its 'customer_id' column lists the customers that
# predictions are generated for further down.
submission = pd.read_csv(data_path / 'sample_submission.csv')

# Transaction log. article_id is forced to str because pandas would otherwise
# parse it as an integer and drop the leading '0'.
transactions = pd.read_csv(
    data_path / 'transactions_train.csv',
    dtype={'article_id': str},
)
取出日期
# Replace the raw 't_dat' column with parsed pandas datetimes, in place.
# NOTE(review): presumably needed so the 1/2/3-week windows can be selected by
# date; the code that builds those windows is not visible in this file.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
分别取出最近3周,2周,1周的交易数据,统计每位顾客对每个商品的购买次数,以及各时间段销量最高的12个商品
# Purchase counts for the 3-week window, shaped as
# {customer_id: {article_id: number of transaction rows}}.
# NOTE(review): transactions_3w is not defined in the visible part of this
# file; presumably `transactions` restricted to the last three weeks -- confirm.
purchase_dict_3w = {}
for cust_id, art_id in zip(transactions_3w['customer_id'], transactions_3w['article_id']):
    # setdefault creates the customer's inner dict on first sight, so articles
    # keep the order in which they first appear for that customer.
    per_article = purchase_dict_3w.setdefault(cust_id, {})
    per_article[art_id] = per_article.get(art_id, 0) + 1

# Number of distinct customers seen in the window.
print(len(purchase_dict_3w))

# The 12 most frequent article ids in the window (value_counts sorts by
# frequency, highest first); used to pad short recommendation lists.
dummy_list_3w = list(transactions_3w['article_id'].value_counts().index)[:12]
# Purchase counts for the 2-week window, shaped as
# {customer_id: {article_id: number of transaction rows}}.
# NOTE(review): transactions_2w is not defined in the visible part of this
# file; presumably `transactions` restricted to the last two weeks -- confirm.
purchase_dict_2w = {}
for cust_id, art_id in zip(transactions_2w['customer_id'], transactions_2w['article_id']):
    # setdefault creates the customer's inner dict on first sight, so articles
    # keep the order in which they first appear for that customer.
    per_article = purchase_dict_2w.setdefault(cust_id, {})
    per_article[art_id] = per_article.get(art_id, 0) + 1

# Number of distinct customers seen in the window.
print(len(purchase_dict_2w))

# The 12 most frequent article ids in the window (value_counts sorts by
# frequency, highest first); used to pad short recommendation lists.
dummy_list_2w = list(transactions_2w['article_id'].value_counts().index)[:12]
# Purchase counts for the 1-week window, shaped as
# {customer_id: {article_id: number of transaction rows}}.
# NOTE(review): transactions_1w is not defined in the visible part of this
# file; presumably `transactions` restricted to the last week -- confirm.
purchase_dict_1w = {}
for cust_id, art_id in zip(transactions_1w['customer_id'], transactions_1w['article_id']):
    # setdefault creates the customer's inner dict on first sight, so articles
    # keep the order in which they first appear for that customer.
    per_article = purchase_dict_1w.setdefault(cust_id, {})
    per_article[art_id] = per_article.get(art_id, 0) + 1

# Number of distinct customers seen in the window.
print(len(purchase_dict_1w))

# The 12 most frequent article ids in the window (value_counts sorts by
# frequency, highest first); used to pad short recommendation lists.
dummy_list_1w = list(transactions_1w['article_id'].value_counts().index)[:12]
遍历每位顾客,依次按1周,2周,3周的数据生成推荐
def _top_articles(purchase_counts, fallback, k=12):
    """Return at most ``k`` article ids for one customer, as a list.

    Args:
        purchase_counts: dict mapping article_id -> purchase count for the
            customer within one time window.
        fallback: list of article ids used to fill the remaining slots.
        k: maximum number of ids to return.

    The customer's own articles come first, highest count first. ``sorted`` is
    stable, so articles with equal counts keep the dict's insertion order.

    NOTE(review): padding does not skip articles the customer already bought,
    so the same id can appear twice in one prediction.
    """
    ranked = sorted(purchase_counts.items(), key=lambda kv: kv[1], reverse=True)
    own = [article for article, _ in ranked]
    # Truncating the concatenation covers both cases: more than k own articles
    # (fallback is cut off entirely) and fewer (fallback fills the gap).
    return (own + fallback)[:k]


# Take an explicit copy: adding the 'prediction' column to a bare slice of
# `submission` makes pandas emit SettingWithCopyWarning.
not_so_fancy_but_fast_benchmark = submission[['customer_id']].copy()

# Prediction for customers absent from every window: the 12 most frequent
# articles of the 1-week window, space-separated.
dummy_list = list((transactions_1w['article_id'].value_counts()).index)[:12]
dummy_pred = ' '.join(dummy_list)

# Windows ordered from most to least recent; the first window that contains
# the customer decides the prediction, padded with that window's top sellers.
recency_tiers = (
    (purchase_dict_1w, dummy_list_1w),
    (purchase_dict_2w, dummy_list_2w),
    (purchase_dict_3w, dummy_list_3w),
)

prediction_list = []
for cust_id in submission['customer_id']:
    for purchases, fallback in recency_tiers:
        if cust_id in purchases:
            s = ' '.join(_top_articles(purchases[cust_id], fallback))
            break
    else:
        # No purchase in any of the three windows.
        s = dummy_pred
    prediction_list.append(s)

not_so_fancy_but_fast_benchmark['prediction'] = prediction_list
print(not_so_fancy_but_fast_benchmark.shape)
# Bare expression: only displays a preview when run as a notebook cell.
not_so_fancy_but_fast_benchmark.head()
提交
# Write the predictions to a CSV file in the working directory;
# index=False keeps the DataFrame's row index out of the file.
not_so_fancy_but_fast_benchmark.to_csv('not_so_fancy_but_fast_benchmark.csv', index=False)