Content-based recommendation system using neural networks - Part 1
15 Jan 2019
This post builds the data we will use for creating our content based model. We’ll collect the data via a collection of SQL queries from a Google Analytics dataset in BigQuery.
Our goal is to recommend an article for a visitor (Client ID) to our site.
In this notebook we will illustrates
- how to pull data from BigQuery for our recommendation system
- how to create train and test sets
import os
import tensorflow as tf
import numpy as np
import google.datalab.bigquery as bq
PROJECT = 'PROJECT' # REPLACE WITH YOUR PROJECT ID
BUCKET = 'BUCKET' # REPLACE WITH YOUR BUCKET NAME
REGION = 'us-central1' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1
# do not change these
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET
os.environ['REGION'] = REGION
os.environ['TFVERSION'] = '1.8'
gcloud config set project $PROJECT
gcloud config set compute/region $REGION
We will use this helper function to write lists containing article ids, categories, and authors for each article in our database to local file.
def write_list_to_disk(my_list, filename):
with open(filename, 'w') as f:
for item in my_list:
line = "%s\n" % item
1. Pull data from BigQuery
a) content_ids.txt
The cell below creates a local text file containing all the article ids (i.e. ‘content ids’) in the dataset which is basically a custom dimension in Google Analytics.
import google.datalab.bigquery as bq
sql="""
#standardSQL
SELECT
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS content_id
FROM `<PROJECT><DATASET><TABLE>`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
AND hits.page.pagePath IS NOT NULL
AND
hits.type = "PAGE"
AND
fullVisitorId IS NOT NULL
AND
hits.time != 0
AND
hits.time IS NOT NULL
AND hits.page.pagePath NOT LIKE '%,%'
AND hits.page.pagePath NOT LIKE '%?%'
AND hits.page.pagePath != '/'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '% %'
AND hits.page.pagePath LIKE '%2019%'
GROUP BY
content_id
"""
content_ids_list = bq.Query(sql).execute().result().to_dataframe()['content_id'].tolist()
write_list_to_disk(content_ids_list, "content_ids.txt")
print("Some sample content IDs {}".format(content_ids_list[:3]))
print("The total number of articles is {}".format(len(content_ids_list)))
b) categories.txt
The cell below creates a local text file containing all categories which are a custom dimension in Google Analytics.
sql="""
SELECT
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS category
FROM `<PROJECT><DATASET><TABLE>`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
AND (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
AND
hits.type = "PAGE"
AND
fullVisitorId IS NOT NULL
AND
hits.time != 0
AND
hits.time IS NOT NULL
AND hits.page.pagePath IS NOT NULL
AND hits.page.pagePath NOT LIKE '%,%'
AND hits.page.pagePath NOT LIKE '%?%'
AND hits.page.pagePath != '/'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '% %'
AND hits.page.pagePath LIKE '%2019%'
GROUP BY
category
"""
categories_list = bq.Query(sql).execute().result().to_dataframe()['category'].tolist()
write_list_to_disk(categories_list, "categories.txt")
print(categories_list)
c) authors_list.txt
The cell below creates a local text file containing all authors which are in a custom dimension too.
sql="""
#standardSQL
SELECT
REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") AS first_author
FROM `<PROJECT><DATASET><TABLE>`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
AND (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
AND
hits.type = "PAGE"
AND
fullVisitorId IS NOT NULL
AND
hits.time != 0
AND
hits.time IS NOT NULL
AND hits.page.pagePath IS NOT NULL
AND hits.page.pagePath NOT LIKE '%,%'
AND hits.page.pagePath NOT LIKE '%?%'
AND hits.page.pagePath != '/'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '% %'
AND hits.page.pagePath LIKE '%2019%'
GROUP BY
first_author
"""
authors_list = bq.Query(sql).execute().result().to_dataframe()['first_author'].tolist()
write_list_to_disk(authors_list, "authors.txt")
print("Some sample authors {}".format(authors_list[:10]))
print("The total number of authors is {}".format(len(authors_list)))
d) medium
The cell below creates a local text file containing all values for medium dimension in Google Analytics.
sql="""
#standardSQL
SELECT
trafficSource.medium AS medium
FROM `<PROJECT><DATASET><TABLE>`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
AND (SELECT MAX(IF(<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
AND
hits.type = "PAGE"
AND
fullVisitorId IS NOT NULL
AND
hits.time != 0
AND
hits.time IS NOT NULL
AND hits.page.pagePath IS NOT NULL
AND hits.page.pagePath NOT LIKE '%,%'
AND hits.page.pagePath NOT LIKE '%?%'
AND hits.page.pagePath != '/'
AND REGEXP_EXTRACT((SELECT MAX(IF(<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
AND REGEXP_EXTRACT((SELECT MAX(IF(<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '% %'
AND hits.page.pagePath LIKE '%2019%'
GROUP BY
medium
"""
medium_list = bq.Query(sql).execute().result().to_dataframe()['medium'].tolist()
write_list_to_disk(medium_list, "medium.txt")
print("Some sample authors {}".format(medium_list[:10]))
print("The total number of authors is {}".format(len(medium_list)))
2. Create train and test sets.
Training set
In this section, we will create the train/test sdatasets for training and test our model. We will concatenate values for visitor_id (fullVisitorId in Google Analytics) and content_id to create a farm fingerprint, taking 90% of the data for the training set. This request will return, for each visitor_id, the content_id, category, author, medium but also the next_content_id.
sql="""
WITH site_history as (
SELECT
fullVisitorId as visitor_id,
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS content_id,
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS title,
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS author_list,
SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') as year_month_array,
LEAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 1) OVER (PARTITION BY fullVisitorId ORDER BY hits.time ASC) as nextCustomDimensions,
trafficSource.medium AS medium,
FROM
`<PROJECT><DATASET><TABLE>`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
AND
fullVisitorId IS NOT NULL
AND
hits.time != 0
AND
hits.time IS NOT NULL
AND
hits.page.pagePath IS NOT NULL
AND
hits.page.pagePath NOT LIKE '%,%'
AND hits.page.pagePath NOT LIKE '%?%'
AND hits.page.pagePath != '/'
AND
SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') IS NOT NULL
AND
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
AND
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '% %'
AND hits.page.pagePath LIKE '%2019%'
)
SELECT
visitor_id,
content_id,
medium,
REGEXP_REPLACE(title, r",", "") as title,
REGEXP_EXTRACT(author_list, r"^[^,]+") as author,
DATE_DIFF(DATE(CAST(year_month_array[OFFSET(0)] AS INT64), CAST(year_month_array[OFFSET(1)] AS INT64), 1), DATE(1970,1,1), MONTH) as months_since_epoch,
nextCustomDimensions as next_content_id
FROM
site_history
WHERE nextCustomDimensions IS NOT NULL AND
MOD(ABS(FARM_FINGERPRINT(CONCAT(visitor_id, content_id))), 10 ) < 9
"""
training_set_df = bq.Query(sql).execute().result().to_dataframe()
training_set_df.to_csv('training_set.csv', header=False, index=False, encoding='utf-8')
training_set_df.head()
Test set
We will repeat the query as above but change outcome of the farm fingerprint hash to collect the remaining 10% of the data for the test set.
sql="""
WITH site_history as (
SELECT
fullVisitorId as visitor_id,
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS content_id,
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS title,
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS author_list,
SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') as year_month_array,
LEAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 1) OVER (PARTITION BY fullVisitorId ORDER BY hits.time ASC) as nextCustomDimensions,
trafficSource.medium AS medium,
FROM
`<PROJECT><DATASET><TABLE>`,
UNNEST(hits) AS hits
WHERE
# only include hits on pages
hits.type = "PAGE"
AND
fullVisitorId IS NOT NULL
AND
hits.time != 0
AND
hits.time IS NOT NULL
AND
hits.page.pagePath IS NOT NULL
AND
hits.page.pagePath NOT LIKE '%,%'
AND hits.page.pagePath NOT LIKE '%?%'
AND hits.page.pagePath != '/'
AND
SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') IS NOT NULL
AND
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
AND
(SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
AND REGEXP_EXTRACT((SELECT MAX(IF(index=11, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
AND REGEXP_EXTRACT((SELECT MAX(IF(index=11, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '% %'
AND hits.page.pagePath LIKE '%2019%'
)
SELECT
visitor_id,
content_id,
medium,
REGEXP_REPLACE(title, r",", "") as title,
REGEXP_EXTRACT(author_list, r"^[^,]+") as author,
DATE_DIFF(DATE(CAST(year_month_array[OFFSET(0)] AS INT64), CAST(year_month_array[OFFSET(1)] AS INT64), 1), DATE(1970,1,1), MONTH) as months_since_epoch,
nextCustomDimensions as next_content_id
FROM
site_history
WHERE nextCustomDimensions IS NOT NULL
AND MOD(ABS(FARM_FINGERPRINT(CONCAT(visitor_id, content_id))), 10) >= 9
"""
test_set_df = bq.Query(sql).execute().result().to_dataframe()
test_set_df.to_csv('test_set.csv', header=False, index=False, encoding='utf-8')