Content-based recommendation system using neural networks

Content-based recommendation system using neural networks - Part 1

15 Jan 2019

This post builds the data we will use for creating our content based model. We’ll collect the data via a collection of SQL queries from a Google Analytics dataset in BigQuery. Our goal is to recommend an article for a visitor (Client ID) to our site.

In this notebook we will illustrates

how to pull data from BigQuery for our recommendation system
how to create train and test sets

import os
import tensorflow as tf
import numpy as np
import google.datalab.bigquery as bq

PROJECT = 'PROJECT' # REPLACE WITH YOUR PROJECT ID
BUCKET = 'BUCKET' # REPLACE WITH YOUR BUCKET NAME
REGION = 'us-central1' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1

# do not change these
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET
os.environ['REGION'] = REGION
os.environ['TFVERSION'] = '1.8'

gcloud  config  set project $PROJECT
gcloud config set compute/region $REGION

We will use this helper function to write lists containing article ids, categories, and authors for each article in our database to local file.

def write_list_to_disk(my_list, filename):
  with open(filename, 'w') as f:
    for item in my_list:
        line = "%s\n" % item

1. Pull data from BigQuery

a) content_ids.txt

The cell below creates a local text file containing all the article ids (i.e. ‘content ids’) in the dataset which is basically a custom dimension in Google Analytics.

import google.datalab.bigquery as bq
sql="""
#standardSQL
SELECT
   (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS content_id
FROM `<PROJECT><DATASET><TABLE>`,   
  UNNEST(hits) AS hits
WHERE 
  # only include hits on pages
  hits.type = "PAGE"
  AND hits.page.pagePath IS NOT NULL
    AND
        hits.type = "PAGE"
      AND
      fullVisitorId IS NOT NULL
      AND
      hits.time != 0
      AND
      hits.time IS NOT NULL
  AND hits.page.pagePath NOT LIKE '%,%'
  AND hits.page.pagePath NOT LIKE '%?%'
  AND hits.page.pagePath != '/'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%&nbsp%'
  AND hits.page.pagePath LIKE '%2019%'
GROUP BY
  content_id
"""
content_ids_list = bq.Query(sql).execute().result().to_dataframe()['content_id'].tolist()
write_list_to_disk(content_ids_list, "content_ids.txt")
print("Some sample content IDs {}".format(content_ids_list[:3]))
print("The total number of articles is {}".format(len(content_ids_list)))

b) categories.txt

The cell below creates a local text file containing all categories which are a custom dimension in Google Analytics.

sql="""
SELECT  
  (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS category 
FROM `<PROJECT><DATASET><TABLE>`,   
  UNNEST(hits) AS hits
WHERE 
  # only include hits on pages
  hits.type = "PAGE"
  AND (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
    AND
        hits.type = "PAGE"
      AND
      fullVisitorId IS NOT NULL
      AND
      hits.time != 0
      AND
      hits.time IS NOT NULL
  AND hits.page.pagePath IS NOT NULL
  AND hits.page.pagePath NOT LIKE '%,%'
  AND hits.page.pagePath NOT LIKE '%?%'
  AND hits.page.pagePath != '/'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%&nbsp%'
  AND hits.page.pagePath LIKE '%2019%'
GROUP BY
  category
"""
categories_list = bq.Query(sql).execute().result().to_dataframe()['category'].tolist()
write_list_to_disk(categories_list, "categories.txt")
print(categories_list)

c) authors_list.txt

The cell below creates a local text file containing all authors which are in a custom dimension too.

sql="""
#standardSQL
SELECT
  REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+")  AS first_author  
FROM `<PROJECT><DATASET><TABLE>`,   
  UNNEST(hits) AS hits
WHERE 
  # only include hits on pages
  hits.type = "PAGE"
  AND (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
  AND
        hits.type = "PAGE"
      AND
      fullVisitorId IS NOT NULL
      AND
      hits.time != 0
      AND
      hits.time IS NOT NULL
  AND hits.page.pagePath IS NOT NULL
  AND hits.page.pagePath NOT LIKE '%,%'
  AND hits.page.pagePath NOT LIKE '%?%'
  AND hits.page.pagePath != '/'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%&nbsp%'
  AND hits.page.pagePath LIKE '%2019%'
GROUP BY   
  first_author
"""
authors_list = bq.Query(sql).execute().result().to_dataframe()['first_author'].tolist()
write_list_to_disk(authors_list, "authors.txt")
print("Some sample authors {}".format(authors_list[:10]))
print("The total number of authors is {}".format(len(authors_list)))

d) medium

The cell below creates a local text file containing all values for medium dimension in Google Analytics.

sql="""
#standardSQL
SELECT
  trafficSource.medium  AS medium
FROM `<PROJECT><DATASET><TABLE>`,   
  UNNEST(hits) AS hits
WHERE 
  # only include hits on pages
  hits.type = "PAGE"
  AND (SELECT MAX(IF(<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
  AND
        hits.type = "PAGE"
      AND
      fullVisitorId IS NOT NULL
      AND
      hits.time != 0
      AND
      hits.time IS NOT NULL
  AND hits.page.pagePath IS NOT NULL
  AND hits.page.pagePath NOT LIKE '%,%'
  AND hits.page.pagePath NOT LIKE '%?%'
  AND hits.page.pagePath != '/'
  AND REGEXP_EXTRACT((SELECT MAX(IF(<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
  AND REGEXP_EXTRACT((SELECT MAX(IF(<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%&nbsp%'
  AND hits.page.pagePath LIKE '%2019%'
GROUP BY   
  medium
"""
medium_list = bq.Query(sql).execute().result().to_dataframe()['medium'].tolist()
write_list_to_disk(medium_list, "medium.txt")
print("Some sample authors {}".format(medium_list[:10]))
print("The total number of authors is {}".format(len(medium_list)))

2. Create train and test sets.

Training set

In this section, we will create the train/test sdatasets for training and test our model. We will concatenate values for visitor_id (fullVisitorId in Google Analytics) and content_id to create a farm fingerprint, taking 90% of the data for the training set. This request will return, for each visitor_id, the content_id, category, author, medium but also the next_content_id.

sql="""
WITH site_history as (
SELECT
      fullVisitorId as visitor_id,
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS content_id,
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS title,
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS author_list,
      SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') as year_month_array,
      LEAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 1) OVER (PARTITION BY fullVisitorId ORDER BY hits.time ASC) as nextCustomDimensions,
      trafficSource.medium AS medium,
  FROM 
    `<PROJECT><DATASET><TABLE>`,   
     UNNEST(hits) AS hits
   WHERE 
     # only include hits on pages
      hits.type = "PAGE"
      AND
      fullVisitorId IS NOT NULL
      AND
      hits.time != 0
      AND
      hits.time IS NOT NULL
      AND
      hits.page.pagePath IS NOT NULL
      AND
      hits.page.pagePath NOT LIKE '%,%'
     AND hits.page.pagePath NOT LIKE '%?%'
     AND hits.page.pagePath != '/'
      AND
      SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') IS NOT NULL
      AND
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
      AND
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL

  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%&nbsp%'
  AND hits.page.pagePath LIKE '%2019%'
      
)
SELECT
  visitor_id,
  content_id,
  medium,
  REGEXP_REPLACE(title, r",", "") as title,
  REGEXP_EXTRACT(author_list, r"^[^,]+") as author,
  DATE_DIFF(DATE(CAST(year_month_array[OFFSET(0)] AS INT64), CAST(year_month_array[OFFSET(1)] AS INT64), 1), DATE(1970,1,1), MONTH) as months_since_epoch,
  nextCustomDimensions as next_content_id
FROM
  site_history
WHERE nextCustomDimensions IS NOT NULL AND
MOD(ABS(FARM_FINGERPRINT(CONCAT(visitor_id, content_id))), 10 ) < 9

"""

training_set_df = bq.Query(sql).execute().result().to_dataframe()
training_set_df.to_csv('training_set.csv', header=False, index=False, encoding='utf-8')
training_set_df.head()

Test set

We will repeat the query as above but change outcome of the farm fingerprint hash to collect the remaining 10% of the data for the test set.

sql="""
WITH site_history as (
SELECT
      fullVisitorId as visitor_id,
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS content_id,
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS title,
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) AS author_list,
      SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') as year_month_array,
      LEAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 1) OVER (PARTITION BY fullVisitorId ORDER BY hits.time ASC) as nextCustomDimensions,  
      trafficSource.medium AS medium,
  FROM 
    `<PROJECT><DATASET><TABLE>`,   
     UNNEST(hits) AS hits
   WHERE 
     # only include hits on pages
      hits.type = "PAGE"
      AND
      fullVisitorId IS NOT NULL
      AND
      hits.time != 0
      AND
      hits.time IS NOT NULL
      AND
      hits.page.pagePath IS NOT NULL
      AND
      hits.page.pagePath NOT LIKE '%,%'
      AND hits.page.pagePath NOT LIKE '%?%'
      AND hits.page.pagePath != '/'
      AND
      SPLIT(RPAD((SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)), 7), '-') IS NOT NULL
      AND
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
      AND
      (SELECT MAX(IF(index=<YOUR INDEX>, value, NULL)) FROM UNNEST(hits.customDimensions)) IS NOT NULL
      
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=11, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%;%'
  AND REGEXP_EXTRACT((SELECT MAX(IF(index=11, value, NULL)) FROM UNNEST(hits.customDimensions)), r"^[^,|\/|;]+") NOT LIKE '%&nbsp%'
  AND hits.page.pagePath LIKE '%2019%'
)
SELECT
  visitor_id,
  content_id,
  medium,
  REGEXP_REPLACE(title, r",", "") as title,
  REGEXP_EXTRACT(author_list, r"^[^,]+") as author,
  DATE_DIFF(DATE(CAST(year_month_array[OFFSET(0)] AS INT64), CAST(year_month_array[OFFSET(1)] AS INT64), 1), DATE(1970,1,1), MONTH) as months_since_epoch,
  nextCustomDimensions as next_content_id
FROM
  site_history
WHERE nextCustomDimensions IS NOT NULL
AND MOD(ABS(FARM_FINGERPRINT(CONCAT(visitor_id, content_id))), 10) >= 9
"""

test_set_df = bq.Query(sql).execute().result().to_dataframe()
test_set_df.to_csv('test_set.csv', header=False, index=False, encoding='utf-8')

Louis Willems