Commit df74e105 authored by usey11

Final changes

parent ea14ac6c
REQUIRES:
Python3
pandas
numpy
matplotlib
sklearn
sktensor
To get started, run Python interactively with run.py:
"python -i .\run.py"
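Example session (illustrative only; it assumes the data/*.csv files are present and
uses the functions defined in checkin.py and run.py shown below):
"python -i .\run.py"
>>> checkin.plot_t_lifestyle(users, checkins, n=3)
>>> import matplotlib.pyplot as plt
>>> plt.show()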
checkin.py
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sktensor import dtensor
basic_categories = ["4d4b7105d754a06374d81259",
"4d4b7105d754a06379d81259",
"4d4b7105d754a06377d81259",
"4d4b7105d754a06378d81259",
"4d4b7104d754a06370d81259",
"4d4b7105d754a06375d81259",
"4d4b7105d754a06376d81259",
"4e67e38e036454776db1fb3a",
"4d4b7105d754a06372d81259",
"4d4b7105d754a06373d81259",
"52f2ab2ebcbc57f1066b8b52"]
category_names = {"4d4b7105d754a06374d81259": "Food",
"4d4b7105d754a06379d81259": "Travel/transport",
"4d4b7105d754a06377d81259": "Outdoors and Recreation",
"4d4b7105d754a06378d81259": "Shop and Service",
"4d4b7104d754a06370d81259": "Arts and Entertainment",
"4d4b7105d754a06375d81259": "Professional",
"4d4b7105d754a06376d81259": "Nightlife Spot",
"4e67e38e036454776db1fb3a": "Residence",
"4d4b7105d754a06372d81259": "College and University",
"4d4b7105d754a06373d81259": "Event",
"52f2ab2ebcbc57f1066b8b52": "Street"}
# ----Helper functions---->
# Get the number of checkins for each user
def get_checkin_counts(users, checkins):
    return users.id.map(lambda id: len(checkins[checkins["userid"] == int(id)]))

# Get all checkins from a particular user
def get_user_checkins(userid, checkins):
    return checkins[checkins["userid"] == int(userid)]

# Get the count of the checkins that occurred at each hour
def get_hourly_checkins(checkins):
    # Count the checkins in each hour of the day
    h = checkins.createdat.map(lambda t: datetime.fromtimestamp(t).hour).value_counts()
    missing = {}
    # For any hours that don't have checkins, add an entry with 0
    for i in range(0, 24):
        if i not in h.index:
            missing[i] = 0
    # Merge the series and sort so the hours are in the right order
    return h.append(pd.Series(missing)).sort_index().values
# Get the count of the checkins from each category
def get_checkins_categories(checkins):
    a = []
    cat_counts = checkins.category.value_counts()
    for cat in basic_categories:
        if cat in cat_counts:
            a.append(cat_counts[cat])
        else:
            a.append(0)
    return a

# Get the uppermost category (climb the tree)
def get_category_head(venue, categories):
    cat = categories[categories["id"] == venue]
    if cat.index.size == 0:
        return
    elif (cat["level"] == "0").bool() or (cat["parentID"] == "\\N").bool():
        return cat
    else:
        return get_category_head(cat.parentID.to_string(index = False), categories)
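# Illustrative usage only (assumes a `categories` table loaded as in run.py,
# with columns "id", "level" and "parentID"): climb from the first non-root
# category in the table up to its top-level head.
# leaf_id = categories.loc[categories["parentID"] != "\\N", "id"].iloc[0]
# head = get_category_head(leaf_id, categories)
# print(head if head is not None else "category not found")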
# <----Helper functions----
# Find and plot the temporal/spatial lifestyles, n=number of lifestyles to find
def plot_t_lifestyle(users, checkins, n=3):
    model = NMF(n_components=n, init="nndsvd")
    W = model.fit_transform(get_activity_matrix_t(users, checkins).T)
    for i in range(0, n):
        plt.plot(W.T[i] / max(W.T[i]))
    H = model.components_
    print("Avg err: ", model.reconstruction_err_ / users.shape[0])

def plot_s_lifestyle(users, checkins, n=3):
    model = NMF(n_components=n, init="nndsvd")
    W = model.fit_transform(get_activity_matrix_s(users, checkins).T)
    for i in range(0, n):
        plt.plot(W.T[i] / max(W.T[i]))
    H = model.components_
    print("Avg err: ", model.reconstruction_err_ / users.shape[0])
# Create Spatial and Temporal Activity Matrices
def get_activity_matrix_t(users, checkins):
    user_count = len(users.index)
    user_ids = users["id"].values
    a = np.zeros([user_count, 24])
    for i in range(0, user_count):
        a[i] = get_hourly_checkins(get_user_checkins(user_ids[i], checkins))
    # Normalise each user's row by its maximum, then drop users with no checkins (NaN rows)
    a = (a.T / a.max(1)).T
    a = a[~np.isnan(a).any(axis=1)]
    return a

def get_activity_matrix_s(users, checkins):
    user_count = users.index.size
    user_ids = users["id"].values
    a = np.zeros([user_count, len(basic_categories)])
    for i in range(0, user_count):
        a[i] = get_checkins_categories(get_user_checkins(user_ids[i], checkins))
    # Normalise each user's row by its maximum, then drop users with no checkins (NaN rows)
    a = (a.T / a.max(1)).T
    a = a[~np.isnan(a).any(axis=1)]
    return a
# Create User Spatial-Temporal activity tensors
# Hourly (users x 24 hours x 11 categories)
def get_activity_tensor_h(users, checkins):
    cat_map = pd.Series({basic_categories[i]: i for i in range(0, len(basic_categories))})
    T = np.zeros((users.index.size, 24, len(basic_categories)))
    user_ids = users["id"].values
    user_count = len(users.index)
    for i in range(0, user_count):
        user_checkins = get_user_checkins(user_ids[i], checkins)
        for index, e in user_checkins.iterrows():
            col = cat_map[e.category]
            row = datetime.fromtimestamp(e.createdat).hour
            T[i, row, col] += 1
    return dtensor(T)

# Daily (users x 7 weekdays x 11 categories)
def get_activity_tensor_d(users, checkins):
    cat_map = pd.Series({basic_categories[i]: i for i in range(0, len(basic_categories))})
    T = np.zeros((users.index.size, 7, len(basic_categories)))
    user_ids = users["id"].values
    user_count = len(users.index)
    for i in range(0, user_count):
        user_checkins = get_user_checkins(user_ids[i], checkins)
        for index, e in user_checkins.iterrows():
            col = cat_map[e.category]
            row = datetime.fromtimestamp(e.createdat).weekday()
            T[i, row, col] += 1
    return dtensor(T)
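# A hedged sketch, not part of this commit: one possible way to factorise the
# user x hour x category tensor with scikit-tensor's CP-ALS routine. The
# default rank of 3 is an assumed example value.
from sktensor import cp_als

def decompose_activity_tensor_h(users, checkins, rank=3):
    T = get_activity_tensor_h(users, checkins)
    P, fit, itr, exectimes = cp_als(T, rank, init="random")
    # P.U[0]: user factors, P.U[1]: hour-of-day factors, P.U[2]: category factors
    return P, fit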
run.py 0 → 100644
import pandas as pd
import checkin
# Load in the data
checkins = pd.read_csv("data/checkins.csv")
users = pd.read_csv("data/users.csv")
venues = pd.read_csv("data/venues.csv")
categories = pd.read_csv("data/categories.csv")
venue_categories = pd.read_csv("data/venuecategoriesbasic.csv")
# Link the tables
checkins = checkins.join(venue_categories.set_index("venue"), on="venueid")
# Calculate the number of checkins for each user
users["count"] = checkin.get_checkin_counts(users, checkins)
# Get the users that have more than t checkins
def get_top_users(users, t):
    if "count" not in users.columns:
        raise ValueError("No user counts under 'count' header")
    # Use bracket access: `users.count` is the DataFrame.count method, not the column
    return users[users["count"] > t]
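# A hedged usage sketch (not part of this commit): keep only the more active
# users and look at their spatial lifestyles. The threshold of 100 checkins is
# an arbitrary example value.
# top_users = get_top_users(users, 100)
# checkin.plot_s_lifestyle(top_users, checkins, n=3)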