Commit df74e105 authored by usey11

Final changes

parent ea14ac6c
REQUIRES:
Python3
pandas
numpy
matplotlib
sklearn
sktensor
To get started, run Python interactively with run.py:
"python -i .\run.py"
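Example session (illustrative only; it assumes the data/*.csv files are present and
uses the functions defined in checkin.py and run.py shown below):
"python -i .\run.py"
>>> checkin.plot_t_lifestyle(users, checkins, n=3)
>>> import matplotlib.pyplot as plt
>>> plt.show()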
checkin.py
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sktensor import dtensor
basic_categories = ["4d4b7105d754a06374d81259",
"4d4b7105d754a06379d81259",
"4d4b7105d754a06377d81259",
"4d4b7105d754a06378d81259",
"4d4b7104d754a06370d81259",
"4d4b7105d754a06375d81259",
"4d4b7105d754a06376d81259",
"4e67e38e036454776db1fb3a",
"4d4b7105d754a06372d81259",
"4d4b7105d754a06373d81259",
"52f2ab2ebcbc57f1066b8b52"]
category_names = {"4d4b7105d754a06374d81259": "Food",
"4d4b7105d754a06379d81259": "Travel/transport",
"4d4b7105d754a06377d81259": "Outdoors and Recreation",
"4d4b7105d754a06378d81259": "Shop and Service",
"4d4b7104d754a06370d81259": "Arts and Entertainment",
"4d4b7105d754a06375d81259": "Professional",
"4d4b7105d754a06376d81259": "Nightlife Spot",
"4e67e38e036454776db1fb3a": "Residence",
"4d4b7105d754a06372d81259": "College and University",
"4d4b7105d754a06373d81259": "Event",
"52f2ab2ebcbc57f1066b8b52": "Street"}
# ----Helper functions---->
# Get the number of checkins for each user
def get_checkin_counts(users, checkins):
    return users.id.map(lambda id: len(checkins[checkins["userid"] == int(id)]))

# Get all checkins from a particular user
def get_user_checkins(userid, checkins):
    return checkins[checkins["userid"] == int(userid)]

# Get the count of the checkins that occurred at each hour
def get_hourly_checkins(checkins):
    # Count the checkins in each hour of the day
    h = checkins.createdat.map(lambda t: datetime.fromtimestamp(t).hour).value_counts()
    missing = {}
    # For any hours that don't have checkins, add an entry with 0
    for i in range(0, 24):
        if i not in h.index:
            missing[i] = 0
    # Merge the series and sort so the hours are in the right order
    return h.append(pd.Series(missing)).sort_index().values
# Get the count of the checkins from each category
def get_checkins_categories(checkins):
    a = []
    cat_counts = checkins.category.value_counts()
    for cat in basic_categories:
        if cat in cat_counts:
            a.append(cat_counts[cat])
        else:
            a.append(0)
    return a

# Get the uppermost category (climb the tree)
def get_category_head(venue, categories):
    cat = categories[categories["id"] == venue]
    if cat.index.size == 0:
        return
    elif (cat["level"] == "0").bool() or (cat["parentID"] == "\\N").bool():
        return cat
    else:
        return get_category_head(cat.parentID.to_string(index = False), categories)
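# Illustrative usage only (assumes a `categories` table loaded as in run.py,
# with columns "id", "level" and "parentID"): climb from the first non-root
# category in the table up to its top-level head.
# leaf_id = categories.loc[categories["parentID"] != "\\N", "id"].iloc[0]
# head = get_category_head(leaf_id, categories)
# print(head if head is not None else "category not found")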
# <----Helper functions----
# Find and plot the temporal/spatial lifestyles, n=number of lifestyles to find
def plot_t_lifestyle(users, checkins, n=3):
    model = NMF(n_components=n, init="nndsvd")
    W = model.fit_transform(get_activity_matrix_t(users, checkins).T)
    for i in range(0, n):
        plt.plot(W.T[i] / max(W.T[i]))
    H = model.components_
    print("Avg err: ", model.reconstruction_err_ / users.shape[0])

def plot_s_lifestyle(users, checkins, n=3):
    model = NMF(n_components=n, init="nndsvd")
    W = model.fit_transform(get_activity_matrix_s(users, checkins).T)
    for i in range(0, n):
        plt.plot(W.T[i] / max(W.T[i]))
    H = model.components_
    print("Avg err: ", model.reconstruction_err_ / users.shape[0])
# Create Spatial and Temporal Activity Matrices
def get_activity_matrix_t(users, checkins):
    user_count = len(users.index)
    user_ids = users["id"].values
    a = np.zeros([user_count, 24])
    for i in range(0, user_count):
        a[i] = get_hourly_checkins(get_user_checkins(user_ids[i], checkins))
    # Normalise each user's row by its maximum, then drop users with no checkins (NaN rows)
    a = (a.T / a.max(1)).T
    a = a[~np.isnan(a).any(axis=1)]
    return a

def get_activity_matrix_s(users, checkins):
    user_count = users.index.size
    user_ids = users["id"].values
    a = np.zeros([user_count, len(basic_categories)])
    for i in range(0, user_count):
        a[i] = get_checkins_categories(get_user_checkins(user_ids[i], checkins))
    # Normalise each user's row by its maximum, then drop users with no checkins (NaN rows)
    a = (a.T / a.max(1)).T
    a = a[~np.isnan(a).any(axis=1)]
    return a
# Create User Spatial-Temporal activity tensors
# Hourly (users x 24 hours x 11 categories)
def get_activity_tensor_h(users, checkins):
    cat_map = pd.Series({basic_categories[i]: i for i in range(0, len(basic_categories))})
    T = np.zeros((users.index.size, 24, len(basic_categories)))
    user_ids = users["id"].values
    user_count = len(users.index)
    for i in range(0, user_count):
        user_checkins = get_user_checkins(user_ids[i], checkins)
        for index, e in user_checkins.iterrows():
            col = cat_map[e.category]
            row = datetime.fromtimestamp(e.createdat).hour
            T[i, row, col] += 1
    return dtensor(T)

# Daily (users x 7 weekdays x 11 categories)
def get_activity_tensor_d(users, checkins):
    cat_map = pd.Series({basic_categories[i]: i for i in range(0, len(basic_categories))})
    T = np.zeros((users.index.size, 7, len(basic_categories)))
    user_ids = users["id"].values
    user_count = len(users.index)
    for i in range(0, user_count):
        user_checkins = get_user_checkins(user_ids[i], checkins)
        for index, e in user_checkins.iterrows():
            col = cat_map[e.category]
            row = datetime.fromtimestamp(e.createdat).weekday()
            T[i, row, col] += 1
    return dtensor(T)
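# A hedged sketch, not part of this commit: one possible way to factorise the
# user x hour x category tensor with scikit-tensor's CP-ALS routine. The
# default rank of 3 is an assumed example value.
from sktensor import cp_als

def decompose_activity_tensor_h(users, checkins, rank=3):
    T = get_activity_tensor_h(users, checkins)
    P, fit, itr, exectimes = cp_als(T, rank, init="random")
    # P.U[0]: user factors, P.U[1]: hour-of-day factors, P.U[2]: category factors
    return P, fit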
run.py 0 → 100644
import pandas as pd
import checkin
# Load in the data
checkins = pd.read_csv("data/checkins.csv")
users = pd.read_csv("data/users.csv")
venues = pd.read_csv("data/venues.csv")
categories = pd.read_csv("data/categories.csv")
venue_categories = pd.read_csv("data/venuecategoriesbasic.csv")
# Link the tables
checkins = checkins.join(venue_categories.set_index("venue"), on="venueid")
# Calculate the number of checkins for each user
users["count"] = checkin.get_checkin_counts(users, checkins)
# Get the users that have more than t checkins
def get_top_users(users, t):
    if "count" not in users.columns:
        raise ValueError("No user counts under 'count' header")
    # Use bracket access: `users.count` is the DataFrame.count method, not the column
    return users[users["count"] > t]
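# A hedged usage sketch (not part of this commit): keep only the more active
# users and look at their spatial lifestyles. The threshold of 100 checkins is
# an arbitrary example value.
# top_users = get_top_users(users, 100)
# checkin.plot_s_lifestyle(top_users, checkins, n=3)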