Using Pyganalytics: https://github.com/chrpr/pyganalytics
Scrape the analytics using something like this (the loop below runs 4x, once for each year of data from 2014 through 2017; adjust the command as needed):
for i in {4..7}; do time python analytics.py -o ~/PATH/TO/DATA/OUTPUT/uniqueEvents_201$i\_01_01-201$i\_12_31.csv -c ~/PATH/TO/CONFIG/hathi_events_config.yml -f 201$i-01-01 -l 201$i-12-31; done
The yml config file is the following for the HathiTrust pageturner analytics account:
yml
query:
  metrics: ga:uniqueEvents
  dimensions: ga:pagePath
  sort:
  filters:
profile: 26478114
Note: I adjusted the delimiters used in the Pyganalytics analytics.py script because page paths in Google Analytics contain everything under the sun, and I needed a field separator unique enough for Pandas to recognize. This also means that you have to use the Python parsing engine when reading the CSVs into Pandas, but given that this is a one-time operation, I think the tradeoff of doing less surgery on the analytics CSVs is worth the potential slowdown.
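For reference, reading one of the scraped CSVs then looks something like this (a minimal sketch; the filename follows the scraping loop above, and the '|~' separator matches the delimiter used in the commented-out queries() cell at the end of this notebook):
import pandas as pd
#Multi-character/regex separators are what force the slower Python parsing engine
events = pd.read_csv('./data/uniqueEvents_2014_01_01-2014_12_31.csv',
                     delimiter=r'\|\~', engine='python')
events.head()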
I also edited the analytics.py script to run daily instead of weekly, to capture more granular results. See notes on how to do that in the Pyganalytics readme.
Notes:
Also something to explore: there is a limited set of results with zero pageviews, even just limiting to event triggers. But since something is triggering the sending of these pages to analytics even with an event count of zero, I'm opting to include them for the moment.
!pip3 install sqlalchemy
!pip3 install rpy2
#For inline matplotlib plots
%matplotlib inline
#For data manipulation and analysis
import pandas as pd
import dask.dataframe as dd #USE `pip3 install 'dask[dataframe]'`
from dask.diagnostics import ProgressBar
#For plotting and dataviz
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("talk", font_scale=1.2)
#So we can use the better ggplot2 R packages to create some of the diagrams below
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
%load_ext rpy2.ipython
%R library(tidyverse)
#Dask progress bar so we can get a view into long-running processes
pbar = ProgressBar()
pbar.register()
def extract_ids():
    '''
    Uses Dask to extract HathiTrust IDs from the raw analytics logs and writes them to a series of CSVs
    '''
    df = dd.read_csv('./data/HathiTrust_unique_events_201*.csv')
    #Mix of my regex experimentation and pattern supplied by Angelina Z at Hathi:
    pattern = r'(?:id=|[a-z0-9]\/)([a-z][a-z0-9]{1,3}\.\$?[a-z0-9._:\/\-]+)'
    #Extract the ID matches into a new column
    df['id'] = df['ga:pagePath'].str.extract(pattern, expand=False)
    #Limit to just rows with ID matches
    df = df[df['id'].notnull()]
    #Remove rows that have 'skin=crms'
    df = df[df['ga:pagePath'].str.contains('skin=crms') == False]
    #Remove some junk punctuation from the end of some IDs
    df['id'] = df['id'].str.rstrip('._#')
    #Rename 'ga:uniqueEvents' column to 'count'
    df = df.rename(columns={'ga:uniqueEvents': 'count'})
    #Write the results to CSVs
    df.to_csv('./data/all_ids_*.csv', index=False)
#This takes roughly 5min to run
%time extract_ids()
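As a quick illustration of what that pattern pulls out of a page path (the sample paths below are made-up for demonstration, not actual log lines):
import re
pattern = r'(?:id=|[a-z0-9]\/)([a-z][a-z0-9]{1,3}\.\$?[a-z0-9._:\/\-]+)'
#Hypothetical ga:pagePath values for demonstration only
samples = ['/cgi/pt?id=mdp.39015012345678;view=1up;seq=7',
           '/cgi/pt/search?q1=whales;id=uc1.$b123456']
for s in samples:
    m = re.search(pattern, s)
    print(s, '->', m.group(1) if m else None)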
def ids_count():
    '''
    Uses output csvs of extract_ids() function to create a tuple of volume IDs paired with pageview counts
    (either unique or aggregate pageview counts depending on how analytics were scraped)
    and writes them to a CSV
    '''
    #Read in all the files with the extracted IDs and their counts
    df = dd.read_csv('./data/all_ids_*.csv')
    #Group that data by volume identifier, and then record the sum total of all hit counts
    ids = df.groupby(by=['id'])['count'].sum()
    #Turn dask dataframe into pandas dataframe so we can use sort_values (not implemented yet in dask)
    ids = ids.compute()
    #Do some naming on the resulting series, and then export to a CSV
    ids.index.name = 'id'
    ids.name = 'count'
    ids.sort_values(ascending=False).to_csv('./csv/all_counts_sorted.csv', header=True)
#This takes roughly 12min to run
%time ids_count()
pd.options.mode.chained_assignment = None
def fix_dollar_sign_ids():
    '''
    Need to fix the UC $ issue, and roll up the total counts so they're only counted
    as single items with all hits. There's some ugly pandas in here, and it could certainly
    be more efficient, but it's quick enough that I'm calling it done for now
    '''
    #Need numpy briefly to do the conditional check later using .where() method
    import numpy as np
    #Import the extracted IDs and counts
    df = pd.read_csv('./csv/all_counts_sorted.csv')
    #New dataframe of just the potentially affected dollar sign IDs
    dollars = df[df['id'].str.contains(r'\$')]
    #New column with the id version minus dollar sign
    dollars.loc[:,'fixed_id'] = dollars['id'].str.replace('$', '', regex=False)
    #Merge with original data to get access to all counts we need to sum
    merged = df.merge(dollars, left_on='id', right_on='fixed_id', suffixes=['_df','_dollars'])
    #Sum the hit counts for the dollar sign IDs and the non dollar sign IDs
    merged.loc[:,'total'] = merged['count_df'] + merged['count_dollars']
    #Merge the totals with the original dataset again, but in a new df just to be careful
    df2 = df.merge(merged, how='outer', left_on='id', right_on='id_dollars')
    #Update the original 'count' column to the holistic total where needed
    df2.loc[:, 'count'] = np.where(df2['total'].notnull(), df2['total'], df2['count'])
    #Remove the non-dollar sign IDs from the df, since the dollar-sign-id totals now reflect
    #the holistic sum. Also removes the now extraneous extra columns
    df2 = df2[df2['id'].isin(df2['id_df']) == False].copy()[['id','count']]
    #Write it all out to a new csv
    df2.sort_values('count', ascending=False).to_csv('./csv/all_counts_sorted_dollar_fixed.csv')
#This takes like 35 seconds to run
%time fix_dollar_sign_ids()
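For reference, a possibly simpler sketch of the same roll-up (my own alternative, not the code used above): group on a normalized key with the '$' stripped, sum the counts, and keep the '$' form as the surviving id, mirroring what fix_dollar_sign_ids() keeps.
#Alternative roll-up sketch (assumes the CSV written by ids_count() above; named aggregation needs pandas >= 0.25)
alt = pd.read_csv('./csv/all_counts_sorted.csv')
alt['key'] = alt['id'].str.replace('$', '', regex=False)
rolled = (alt.groupby('key')
             .agg(id=('id', lambda s: max(s, key=lambda x: '$' in x)),  #prefer the '$' form when both occur
                  count=('count', 'sum'))
             .sort_values('count', ascending=False))
#rolled.to_csv('./csv/all_counts_sorted_dollar_fixed_alt.csv', index=False)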
#Set up the postgres connection
import sqlalchemy as sa
def connect(user, password, db, host='localhost', port=5432):
    '''Returns a connection and a metadata object'''
    # We connect with the help of the PostgreSQL URL
    url = 'postgresql://{}:{}@{}:{}/{}'
    url = url.format(user, password, host, port, db)
    # The return value of create_engine() is our connection object
    con = sa.create_engine(url, client_encoding='utf8')
    # We then bind the connection to MetaData() and reflect the existing tables
    # (reflect() returns None, so reflect in place rather than assigning its return value)
    meta = sa.MetaData(bind=con)
    meta.reflect()
    return con, meta
con, meta = connect('postgres', '', 'hathifiles')
#Import the Hathifiles into a postgres database
'''
This cell does require one step not included here, which is adding the header row
to the txt file with the column names. I do that in an inelegant but fine way:
- Copy the row of headers, including tab delimiters, to a new file
- Append the text of the massive hathi text file (~4gb) to that new file:
`cat hathi_full_20171101.txt >> headers_for_hathi_full_20171101.txt`
- Delete the old file, and rename the new file the same as the old (but now it includes column headers)
'''
#Struggled for a long time with this, but turns out the delimiter needs to be r'\t', not just '\t'
#hathi_data = pd.read_csv('./data/hathifiles/hathi_full_20180501.txt', engine='python', delimiter=r'\t', encoding='utf-8', chunksize=1000000)
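#Untested alternative sketch: skip the manual header-prepend step by passing the column
#names straight to read_csv (names= would be the same header row as a Python list, in file order):
#hathi_data = pd.read_csv('./data/hathifiles/hathi_full_20180501.txt', engine='python', delimiter=r'\t',
#                         encoding='utf-8', chunksize=1000000, header=None, names=HATHIFILE_COLUMNS)
#where HATHIFILE_COLUMNS is a placeholder for that list of column names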
def postgres_import():
    i = 0
    for chunk in hathi_data:
        chunk = chunk[['id','access', 'rights', 'hathitrust_record_number', 'enumeration_chronology', 'source', 'source_institution_record_number', 'oclc_numbers', 'isbns', 'issns', 'lccns', 'title', 'imprint', 'rights_determination_reason_code', 'date_of_last_update', 'government_document', 'publication_date', 'publication_place' ,'language', 'bibliographic_format', 'collection_code', 'content_provider_code', 'responsible_entity_code', 'digitization_agent_code']]
        try:
            chunk.to_sql('hathifiles', con, if_exists='append')
            print (i, chunk.index[0])
            i += 1
        except Exception:
            #Log the first index of any chunk that fails to load, so it can be retried
            print (chunk.index[0])
#This takes roughly 2 hours 10min to run on my macbook air
#%time postgres_import()
def get_access_and_date():
    gf = pd.read_csv('./csv/all_counts_sorted_dollar_fixed.csv')
    gf['access'] = ''
    gf['date'] = ''
    gf['title'] = ''
    gf['oclc'] = ''
    # gf['format'] = ''
    # gf['pub_place'] = ''
    print (len(gf))
    header = 'id,title,access,date,oclc,gov_doc,publication_place,language,bib_format'
    text_file = open("./csv/all_id_title_access_date_oclc_languages_formats.csv", "w")
    text_file.write(header+'\n')
    text_file.close()
    #Query the Hathifiles metadata in batches of 250,000 IDs at a time, appending each batch to the CSV
    for i in range(0,len(gf),250000):
        xf = gf[i:(i+250000)]
        ids = []
        for index, row in xf.iterrows():
            ids.append(row['id'])
        x = pd.read_sql_query("select id, title, access, publication_date, oclc_numbers, government_document, publication_place, language, bibliographic_format from hathifiles where id in "+str(tuple(ids)), con=con)
        x.to_csv('./csv/all_id_title_access_date_oclc_languages_formats.csv', mode='a', encoding='utf-8', header=False, index=False)
        if i % 10000 == 0:
            print (i)
#This takes about 33min if the postgres table has an index on "id"
#%time get_access_and_date()
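If the hathifiles table doesn't have that index yet, it can be created once through the existing SQLAlchemy engine (the index name below is my own choice):
#One-off index creation; Engine.execute() here assumes the SQLAlchemy 1.x style used by connect() above
#con.execute('CREATE INDEX IF NOT EXISTS hathifiles_id_idx ON hathifiles (id)')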
Some cells below are just checks to make sure outputs look roughly correct, others set up some data for visualization or other manipulation
#Read our various CSVs into dataframes so we can work with them
counts = pd.read_csv('./csv/all_counts_sorted.csv')
d_counts = pd.read_csv('./csv/all_counts_sorted_dollar_fixed.csv')
full = pd.read_csv('./csv/all_id_title_access_date_oclc_languages_formats.csv')
# Gut check on dollar sign fix and missing volumes
cs = set(counts['id'])
ds = set(d_counts['id'])
fs = set(full['id'])
#Total number of dollar sign ids fixed
print("dollar sign ids fixed: %s" % (len(cs - ds)))
'''
To get a sense of whether we've missed things with the above data transformations,
this spits out the things that the parsing found as IDs but couldn't find in the Hathifiles.
Generally there are a few hundred here: things that have been added to Hathi via ingest, and are therefore
available to analytics events, but haven't yet had their metadata added to the monthly Hathifile updates.
Gut checking, a delta of less than a thousand a month seems ok, and not something I'm super worried about
in terms of skewing the data.
'''
diff = ds - fs
missing = []
def print_missing():
    for d in diff:
        if "uc1" not in d:
            missing.append(d)
        else:
            pass
    print ("Total non-dollar sign IDs in data, but not found in hathifiles: %s" % (len(missing)))
    for item in missing:
        print (item)
#print_missing()
'''This merges the metadata extracted from the Hathifiles with the top counts from the analytics,
and spits out the list of the most viewed items in HathiTrust
But this could easily be tweaked to show top NYPL items, top items that were denied access (excellent ILL candidates),
top items published in a given country, etc. '''
counts_ids = full.merge(d_counts, on='id', suffixes=['_full','_counts'])
counts_ids = counts_ids.drop('Unnamed: 0', axis=1)
#Double check -- how many items had event triggers recorded in order to appear in the analytics, but had the count recorded as zero?
zeroes = counts_ids[counts_ids['count'] < 1]
print (len(zeroes))
#full = pd.read_csv('./csv/all_id_title_access_date_oclc.csv')
allow = full[full.access == 'allow']
deny = full[full.access == 'deny']
print ("%s total volumes that triggered analytics events in the collected data" % len(full))
print ("%s total open volumes have triggered analytics events in the collected data" % len(allow))
print ("%s total limited view volumes have triggered analytics events in the collected data" % len(deny))
#Top 25 titles in Hathi
counts_ids[['id','title','date','count']].sort_values('count', ascending=False).head(25)
#Top 25 limited view titles in Hathi
counts_ids[counts_ids.access == 'deny'][['id','title','date','count']].sort_values('count', ascending=False).head(25)
#Top 25 Hathi volumes scanned from NYPL collections
counts_ids[counts_ids['id'].str.startswith('nyp') == True][['id','title','date','count']].sort_values('count', ascending=False).head(25)
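As noted above, the same merged dataframe can be filtered on any of the Hathifiles columns pulled into it; for example, a hypothetical tweak (not part of the original analysis) for top volumes from a given country of publication, using whichever MARC country code is of interest in the publication_place column:
#Top 25 volumes published in a given country ('gw', the MARC code for Germany, used here as an example)
counts_ids[counts_ids['publication_place'].str.strip() == 'gw'][['id','title','date','count']].sort_values('count', ascending=False).head(25)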
N.B. See the interactive dataviz for a larger and more useful view of this data
all_years = full[(full.date > 1799) & (full.date < 2018)].groupby('date')['id'].count()
allow_years = allow[(allow.date > 1799) & (allow.date < 2018)].groupby('date')['id'].count()
deny_years = deny[(deny.date > 1799) & (deny.date < 2018)].groupby('date')['id'].count()
#Plots publication date of volumes with analytics events, as full-view vs limited view volumes
#Figure 1
#all_years.plot()
allow_years.plot(figsize=(10,7))
deny_years.plot(figsize=(10,7))
plt.xlabel('Date of publication')
plt.ylabel('total number of volumes')
plt.title('"Full View" (blue) and "Limited View" (orange) clicked volumes in HathiTrust');
#This grabs all dates from the postgres DB to do some data analysis and histograms, etc.
all_dates = pd.read_sql_query("SELECT DISTINCT publication_date, count(publication_date) \
from hathifiles GROUP BY publication_date ORDER BY publication_date ASC", con=con)
all_dates_a = pd.read_sql_query("SELECT DISTINCT publication_date, count(publication_date) \
from hathifiles WHERE access = 'allow' GROUP BY publication_date ORDER BY publication_date ASC", con=con)
all_dates_d = pd.read_sql_query("SELECT DISTINCT publication_date, count(publication_date) \
from hathifiles WHERE access = 'deny' GROUP BY publication_date ORDER BY publication_date ASC", con=con)
dates = all_dates[(all_dates.publication_date < 2018) & (all_dates.publication_date > 1799)]
dates_a = all_dates_a[(all_dates_a.publication_date < 2018) & (all_dates_a.publication_date > 1799)]
dates_d = all_dates_d[(all_dates_d.publication_date < 2018) & (all_dates_d.publication_date > 1799)]
dates.index = dates.publication_date
dates_a.index = dates_a.publication_date
dates_d.index = dates_d.publication_date
dates_a.loc[:,'accessed'] = allow_years
dates.loc[:,'full view'] = dates_a['count']
dates.loc[:,'limited view'] = dates_d['count']
dates.loc[:,'full view accessed'] = allow_years
dates.loc[:,'limited view attempted'] = deny_years
#dates_a
#dates
#To export all counts, full view counts, and limited view counts to a CSV for easier use later:
#dates.to_csv('./csv/all_years_data.csv')
Below takes a look at the total number of openly available volumes, and what percentage have been accessed since mid-2013 (figure 2)
#Calculate raw utilization rate
util = dates['full view accessed'] / dates['count']
dates_a.loc[:, 'percent'] = dates_a['accessed'] / dates_a['count']
print ("The overall average utilization rate for open volumes 1800-2017 is: {0:.2f}%\n".format(100*dates_a['percent'].mean()))
print ("The average utilization rate for open volumes 1800-1875 is: {0:.2f}%\n".format(100*dates_a[1800:1875]['percent'].mean()))
print ("The average utilization rate for open volumes 1876-1922 is: {0:.2f}%\n".format(100*dates_a[1876:1922]['percent'].mean()))
print ("The average utilization rate for open volumes 1923-1962 is: {0:.2f}%\n".format(100*dates_a[1923:1962]['percent'].mean()))
print ("The average utilization rate for open volumes 1963-2017 is: {0:.2f}%\n".format(100*dates_a[1963:2017]['percent'].mean()))
#plot open volume utilization per year
#Figure 2
dates_a['percent'].plot(figsize=(10,7)).set_ylim([0,1])
plt.xlabel('Date of publication')
plt.ylabel('Percentage of open volumes accessed')
plt.title('"Full View" volumes utilization per year');
# #Plot publication year distribution of accessed volumes (blue, left) against utilization rate by year (red, right)
# import matplotlib.ticker as ticker
# fig, ax1 = plt.subplots(figsize=(10,7))
# ax1.plot(allow_years, color='b')
# ax1.set_ylabel('# of works', color='b')
# ax1.tick_params('y', colors='b')
# #ax1.set_yticks([0,10000])
# ax1.set_ylim([0,70000])
# ax2 = ax1.twinx()
# ax2.plot(util, color='r')
# ax2.set_ylabel('% utilized', color='r')
# ax2.tick_params('y', colors='r')
# ax2.set_ylim([0,1])
# ax2.yaxis.set_major_locator(ticker.MultipleLocator(1 / 7))
While the above utilization plot (figure 2) takes a binary view of each volume in a given year -- either clicked on, or not clicked on -- the following plot (figure 3) shows the average views per item per year (ending at publication date 2015 to exclude some outliers that skew the curve because of the limited data and small number of hits in the analytics).
Even more clearly than the figure above, this demonstrates that materials from 1923 to 1963, corresponding to the CRMS review period, are disproportionately used, even after accounting for the variable number of volumes in a given year.
#Charting average view counts per volume per year
#Figure 3
avg_usage = counts_ids[(counts_ids.date > 1799) & (counts_ids.date < 2015)].groupby('date')['count'].mean()
#avg_usage_non_0 = counts_ids[(counts_ids.date > 1799) & (counts_ids.date < 2015) & (counts_ids['count'] > 0)].groupby('date')['count'].mean()
avg_usage.plot(figsize=(10,7))
#avg_usage_non_0.plot(figsize=(10,7))
plt.xlabel('Date of publication')
plt.ylabel('Average events per volume')
plt.title('Average usage per volume per year');
avg_usage_a = counts_ids[(counts_ids.date > 1799) & (counts_ids.date < 2015) & (counts_ids.access == 'allow')].groupby('date')['count'].mean()
avg_usage_d = counts_ids[(counts_ids.date > 1799) & (counts_ids.date < 2015) & (counts_ids.access == 'deny')].groupby('date')['count'].mean()
If we limit to only open volumes, plotting average usage shows even more distinct regions (figure 4):
#Figure 4
avg_usage_a.plot(figsize=(10,7))
plt.xlabel('Date of publication')
plt.ylabel('Average events per "Full View" volume')
plt.title('Average usage per "Full View" volume per year');
pdata = pd.DataFrame()
pdata.loc[:, 'all'] = dates['count']
#pdata.loc[:, 'accessed'] = allow_years
#pdata.loc[:, 'denied'] = deny_years
The cell below (figure 5) shows the publication date distribution of everything in the HathiTrust corpus
#Figure 5
import seaborn as sns
pdata.plot(figsize=(11.5,7))
plt.xlabel('Date of publication')
plt.ylabel('Total number of volumes')
plt.title('All HathiTrust volumes by publication year');
For comparison, figure 6 shows the distribution of all publications in WorldCat, as estimated by Brian F. Lavoie and Roger C. Schonfeld in a 2006 Journal of Electronic Publishing article titled "Books without Boundaries: A Brief Tour of the System-wide Print Book Collection."
For context, figure 7 shows the same data shown above in the Hathi publication date plot, but on the same scale as the OCLC data:
#Figure 7
pdata.plot(figsize=(11.5,7)).set_ylim(0, 700000)
plt.xlabel('Date of publication')
plt.ylabel('Total number of volumes')
plt.title('All HathiTrust volumes by publication year');
In contrast, figure 8 below plots, on the same scale, the volumes in HathiTrust that triggered an analytics event of some kind.
#Figure 8
all_years.plot(figsize=(11.5,7)).set_ylim(0, 700000)
plt.xlabel('Date of publication')
plt.ylabel('Total number of volumes')
plt.title('All HathiTrust volumes with analytics events by publication year');
Because items with a publication date of '9999' are not included in the range of the histograms above, it's useful to do a separate analysis of items with this very specific publication date, most of which are serials or other ongoing publications.
all_ongoing_a = pd.read_sql_query("SELECT publication_date, count(id) \
from hathifiles where publication_date = 9999 and access = 'allow' GROUP BY publication_date", con=con)
all_ongoing_d = pd.read_sql_query("SELECT publication_date, count(id) \
from hathifiles where publication_date = 9999 and access = 'deny' GROUP BY publication_date", con=con)
all_ongoing = pd.read_sql_query("SELECT publication_date, count(id) \
from hathifiles where publication_date = 9999 GROUP BY publication_date", con=con)
#Ongoing volumes that triggered analytics events
ongoing_events = full[(full.date == 9999)].groupby('date')['id'].count()
ongoing_events_a = full[(full.date == 9999) & (full.access == 'allow')].groupby('date')['id'].count()
ongoing_events_d = full[(full.date == 9999) & (full.access == 'deny')].groupby('date')['id'].count()
print ("There are a total of %s volumes with a publication date of '9999' listed in the Hathifiles \n" % all_ongoing['count'].iloc[0])
print ("There are %s volumes with a publication date of '9999' that triggered analytics events" % ongoing_events.iloc[0])
print ("There are %s 'Full View' volumes with a publication date of '9999' that triggered analytics events" % ongoing_events_a.iloc[0])
print ("There are %s 'Limited View' volumes with a publication date of '9999' that triggered analytics events" % ongoing_events_d.iloc[0])
#Figure 9
ongoing_plot = pd.DataFrame()
ongoing_plot.loc['full view', 'all possible volumes\n with "9999"'] = all_ongoing_a['count'].iloc[0]
ongoing_plot.loc['limited view', 'all possible volumes\n with "9999"'] = all_ongoing_d['count'].iloc[0]
ongoing_plot.loc['full view', 'Accessed volumes\n with "9999"'] = ongoing_events_a.iloc[0]
ongoing_plot.loc['limited view', 'Accessed volumes\n with "9999"'] = ongoing_events_d.iloc[0]
ongoing_plot.T.plot(kind='barh', color=['g','r'], stacked=True, legend=True, width=.4, linewidth=.4, figsize=(11.5,7))
plt.xlabel('Volumes')
plt.title('Volumes with 9999 for publication year');
from __future__ import division
# Determine ongoing serials utilization rate
print( "%s of all possible 9999s are open volumes" % all_ongoing_a['count'].iloc[0])
print( "%s of all open 9999 volumes have been accessed" % ongoing_events_a.iloc[0])
print("The overall utilization rate of open 9999s in HathiTrust is {0:.2f}%".format(100.*(int(ongoing_events_a.iloc[0]) / int(all_ongoing_a['count'].iloc[0]))))
#Top serials and ongoing publications with publication date of '9999'
print("Top Serials and other 9999 volumes:")
counts_ids[(counts_ids.date == 9999) & (counts_ids.access == 'allow') & (counts_ids['count'] > 0)].sort_values('count', ascending=False)[['id','title','date','count']].head(20)
#Grab the numbers for language of publication, as far as the Hathifiles know
%time all_languages = pd.read_sql_query("SELECT language, count(language), publication_date \
from hathifiles GROUP BY language, publication_date ORDER BY publication_date ASC", con=con)
#Limit to just 1800-2017, to match other data
all_languages = all_languages[(all_languages.publication_date < 2018) & (all_languages.publication_date > 1799)]
#Reshape to get counts by date
all_languages = all_languages.sort_values(['publication_date','count'])
used_languages = counts_ids[(counts_ids.date < 2018) & (counts_ids.date > 1799)][['id','access','date','language']]
used_m = used_languages.groupby(['language','date'])['id'].count()
used_mm = used_languages.groupby(['language'])['id'].count()
used_mm = used_mm.sort_values(ascending=False)
# Similar reshape for all languages
aa = all_languages.groupby('language')['count'].sum()
aa = aa.sort_values(ascending=False)
#If we just want to look at top 20 languages by holding, use ab
ab = aa[aa.index[0:20]]
If we compare the "publication language" for all the items in the Hathifiles against the languages of just the volumes used since 1/1/2014, we can get a sense of which languages dominate the collection. In addition, we can then get a sense of which languages are under-utilized or over-utilized, relative to their overall holdings. (Figures 10 and 11)
#Figure 10
#Plotting the top 20 languages in terms of overall holdings
ab.sort_values(ascending=True).plot(kind='barh',figsize=(16,14))
plt.xlabel('Number of volumes in a given language')
plt.ylabel('Language')
plt.title('Top 20 Held Languages');
#Figure 11
#Plotting the top 20 languages in terms of used volumes
used_mm[0:20].sort_values(ascending=True).plot(kind='barh',figsize=(16,14))
plt.xlabel('Number of volumes in a given language used')
plt.ylabel('Language')
plt.title('Top 20 Most Used Languages');
#Merge used and all volumes data, and prep for porting to R
lf1 = pd.DataFrame({'series':aa})
lf2 = pd.DataFrame({'series':used_mm})
#merged_langs = aa.merge(used_mm)
lf = lf1.merge(lf2, left_index=True, right_index=True, how='outer')
lf.rename(columns={'series_x': 'total_volumes', 'series_y': 'used_volumes'}, inplace=True)
lf.loc[:, 'percent_used'] = lf.used_volumes / lf.used_volumes.sum()
lf.loc[:, 'percent_all'] = lf.total_volumes / lf.total_volumes.sum()
lf.loc[:, 'language'] = lf.index
lf = lf.sort_values('total_volumes', ascending=False)
# Port the top 50 languages, by holdings, to an R dataframe
rlf = pandas2ri.py2ri(lf[0:50])
The following two charts (figures 12 and 13) look at how the percentage of overall holdings compares to the percentage of used volumes for a given language.
The first chart (figure 12) compares usage vs holdings as a scatter plot, and includes a line marking the 1:1 slope -- therefore, languages above this line are disproportionately used relative to their volume in the overall corpus, while languages below the line are relatively under-utilized. Meanwhile, distance from the line indicates the extent of the disparity, so for example Ancient Greek (closer to the lower left) has relatively few holdings, but is heavily used given the volume, whereas there are significant holdings in Russian, Chinese, and Japanese (all closer to the top right) that are seemingly under-utilized.
For both charts, only the top 50 languages by holding are considered. Both charts use log scales to make the trends more visible.
%%R -w 800 -h 600 -u px -i rlf
#Figure 12
# Plotting the top 50 languages in terms of overall holdings, comparing usage vs holdings by percent (log scale)
# N.B. This excludes English, but English can be included by commenting out the line indicated below
library(ggrepel)
library(scales)
library(reshape2)
set.seed(42)
rlf %>%
filter(language != 'eng') %>% ### COMMENT THIS LINE OUT TO ADD 'eng' BACK TO THE PLOT
ggplot(aes(x=percent_all, y=percent_used, color=language))+
geom_point(size=4)+
geom_abline(intercept = 0, slope = 1)+
geom_text_repel(aes(percent_all, percent_used, label = language), size=8)+
theme_minimal(base_size = 18) +
theme(legend.position="none")+
scale_x_log10(labels = percent)+
scale_y_log10(labels = percent)+
xlab('Percent of All Volumes') + ylab('Percent of Used Volumes')+
ggtitle('Usage vs. Holdings Comparison')
%%R -w 800 -h 1000 -u px
#Figure 13
rlf %>%
mutate(indicator = ifelse( percent_all < 0.9*percent_used, 'Most heavily used',
ifelse(percent_all > 2*percent_used,
"Most relatively under-used",
"Roughly average use"))) %>%
select(-total_volumes, -used_volumes) %>%
rename(., 'Percent of \nUsed Volumes' = percent_used, 'Percent of \nAll Volumes' = percent_all) %>%
melt( variable.name = "variable_percent",
      value.name = "value",
      id.vars = c("language", "indicator")) ->
rlf_melted
rlf_melted %>%{
ggplot(., aes(x = as.factor(reorder(variable_percent, desc(variable_percent))), y = value, group = language, color = indicator, label = language)) +
geom_line(size = 2, alpha = 3/10) +
geom_point(size = 5, alpha = 10/10) +
geom_text_repel(aes(),
show_guide = F,
size=6,
nudge_x = 0.25,
direction = "y",
hjust = 0,
subset(., indicator != 'Roughly average use' & variable_percent != 'Percent of \nAll Volumes')
)+
geom_text_repel(aes(),
show_guide = F,
size=6,
nudge_x = -0.25,
direction = "y",
hjust = 1,
subset(., indicator != 'Roughly average use' & variable_percent != 'Percent of \nUsed Volumes' )
)+
theme_minimal(base_size = 18) +
scale_color_manual(values=c("darkgreen","#ff0000", '#dddddd'))+
scale_y_log10(labels = percent)+
guides(color=guide_legend(title="Use Relative to \nOverall Holdings:"))+
xlab("")+ylab('Percent')+
ggtitle('Relative Over- and Under-Utilization')
}
# Fun for later:
# #To extract search queries
# def queries():
# #Note: this extracts only the first match of the pattern in the source string, but for our purposes this should be both fine and correct
#
# df = dd.read_csv('./data/uniqueEvents_201*.csv', delimiter='\|\~', engine='python')
# pattern = '(?P<Query>q1=[^\&\n\|]*|lookfor=[^\&\n\|]*)'
# t = df['ga:pagePath'].str.extract(pattern, expand=True)
# df_queries = t[t['Query'].notnull()]
# df_queries.to_csv('./csv/queries.csv', index=False)
# # %time queries()
#With some more parsing for common bigrams and unigrams, and using this repo: https://github.com/amueller/word_cloud
#I was able to produce what I think is a pretty compelling version of a word cloud:
[word cloud image]
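# #A hedged sketch (not the exact code used) of producing that word cloud from the queries
# #CSV above, using the word_cloud package (pip3 install wordcloud). Assumes queries() has
# #been run; the 'Query' column name comes from the named group in its regex.
# import pandas as pd
# import matplotlib.pyplot as plt
# from wordcloud import WordCloud
# queries = pd.read_csv('./csv/queries.csv')
# terms = (queries['Query'].dropna()
#          .str.replace(r'^(q1=|lookfor=)', '', regex=True)  #strip the query parameter names
#          .str.replace('+', ' ', regex=False))              #'+' encodes spaces in query strings
# wc = WordCloud(width=1200, height=800, background_color='white').generate(' '.join(terms))
# plt.figure(figsize=(12, 8))
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off');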