Individual Exercise Solution

library(readtext) # easily read in text in directories
library(lubridate) # year function
library(quanteda) # corpus creation
library(stm) # structural topic models
# download speeches and metadata
download.file('http://erdos.ucd.ie/files/europarl/europarl-data-speeches.zip',
              'europarl-data-speeches.zip')

download.file('http://erdos.ucd.ie/files/europarl/europarl-metadata.zip',
              'europarl-metadata.zip')

# extract speeches and metadata
unzip('europarl-data-speeches.zip')
unzip('europarl-metadata.zip')
# recursively get filepaths for speeches from 09-12
speeches_paths <- list.files(path = c('europarl-data-speeches/2009',
                                      'europarl-data-speeches/2010'),
                             recursive = T, full.names = T)

# read in speeches
speeches <- readtext(speeches_paths)

speeches <- corpus(speeches)
metadoc(speeches, field = 'type') <- 'European Parliament Speech'

# read in speech docvars
speeches_dv <- read.delim('europarl-documents-metadata.tsv', sep = '\t')

# subset metadata to 2009-20012
speeches_dv <- speeches_dv[year(speeches_dv$date) >= 2009 &
                                   year(speeches_dv$date) <= 2010, ]

# read in MEP docvars
MEP_dv <- read.delim('europarl-meps-metadata.tsv', sep = '\t')

# merge MEP docvars onto speech metadata
dv <- merge(speeches_dv, MEP_dv, all.x = T,
            by.x = 'mep_ids', by.y = 'mep_id')

# merge docvars onto corpus
docvars(speeches) <- dv

# drop any texts with missing document variables
speeches <- corpus_subset(speeches, !is.na(country_short))

## subset to 10% of corpus
speeches_sub <- corpus_sample(speeches, size = floor(ndoc(speeches) / 10))

## create vector of corpus-specific stopwords
custom_stops <- c('mr', 'mrs', 'president', 'paragraph', 'resolut', 'statement',
                  'gentlemen', 'ladi', 'regard', 'mention', 'point', 'debate')

# create document feature matrix from corpus
speeches_dfm <- dfm(speeches_sub, tolower = T, stem = T,
                    remove = c(custom_stops, stopwords('english')),
                    remove_punct = T)

## convert dfm to stm object
speeches_stm <- convert(speeches_dfm, to = 'stm')

## remove tokens w/ fewer than 5 appearances
speeches_stm <- prepDocuments(speeches_stm$documents, speeches_stm$vocab,
                              speeches_stm$meta, lower.thresh = 5)
## Removing 8684 of 12751 terms (16169 of 275578 tokens) due to frequency 
## Your corpus now has 3334 documents, 4067 terms and 259409 tokens.
fit_stm_sw <- stm(documents = speeches_stm$documents, vocab = speeches_stm$vocab,
                  K = 15, prevalence = ~ country + group, seed = 374075,
                  data = speeches_stm$meta, sigma.prior = .1)
labelTopics(fit_stm_sw, n = 10)
## Topic 1 Top Words:
##       Highest Prob: like, want, can, commission, think, also, one, say, question, need 
##       FREX: want, thing, cours, talk, realli, said, think, commission, someth, question 
##       Lift: davi, score, wonder, paul, realli, sure, bütikof, chris, hans-pet, kamal 
##       Score: say, said, want, commission, realli, talk, someth, speak, question, cours 
## Topic 2 Top Words:
##       Highest Prob: parliament, european, vote, agreement, 2009, resolut, rule, council, write, favour 
##       FREX: item, statement, vote, written, motion, agreement, request, 2009, data, mep 
##       Lift: part-sess, thursday, 149, item, raül, statement, ukip, wednesday, 12.00, 142 
##       Score: vote, agreement, data, item, 149, amend, motion, resolut, statement, written 
## Topic 3 Top Words:
##       Highest Prob: european, union, citizen, eu, treati, parliament, lisbon, secur, new, relat 
##       FREX: treati, lisbon, turkey, citizen, partnership, visa, extern, cooper, secur, role 
##       Lift: delic, justa, ombudsman, palecki, paliad, severin, vinca, diamandouro, enlarg, sport 
##       Score: treati, lisbon, turkey, visa, citizen, moldova, russia, ombudsman, cooper, partnership 
## Topic 4 Top Words:
##       Highest Prob: financi, crisi, fund, econom, european, budget, will, need, financ, new 
##       FREX: financi, budget, tax, crisi, financ, recoveri, bank, rate, fund, eur 
##       Lift: a7-0246, giegold, haven, supervisori, auster, budget, financi, microfin, recoveri, tax 
##       Score: financi, fund, budget, crisi, euro, tax, financ, eur, recoveri, bank 
## Topic 5 Top Words:
##       Highest Prob: will, european, council, group, parliament, member, like, us, europ, today 
##       FREX: presid, fellow, czech, barroso, spanish, speech, minist, togeth, council, van 
##       Lift: czech, belgian, corien, merkel, president-in-offic, ride, rompuy, turm, vondra, barroso 
##       Score: presid, council, minist, barroso, czech, applaus, greec, rompuy, spanish, say 
## Topic 6 Top Words:
##       Highest Prob: report, propos, committe, rapporteur, also, like, public, support, parliament, includ 
##       FREX: committe, rapporteur, report, propos, document, compromis, internet, recommend, opinion, affair 
##       Lift: andersson, göran, mander, unreason, färm, lundgren, nil, procur, draftsman, concili 
##       Score: report, committe, propos, rapporteur, amend, internet, procur, compromis, market, public 
## Topic 7 Top Words:
##       Highest Prob: right, human, european, women, protect, fundament, equal, must, freedom, state 
##       FREX: women, immigr, convent, discrimin, traffick, terror, asylum, sexual, gender, penalti 
##       Lift: asylum, barrot, detent, digniti, exhibit, frontex, guantánamo, hautala, migratori, offenc 
##       Score: women, human, violenc, sexual, right, traffick, terror, immigr, crime, crimin 
## Topic 8 Top Words:
##       Highest Prob: member, state, european, commiss, inform, system, will, legisl, legal, regul 
##       FREX: agenc, inform, transpar, legal, applic, administr, legisl, system, provis, patient 
##       Lift: a7-0215, accredit, pharmacovigil, tremopoulo, acqui, gering, leaflet, oedenberg, transposit, web 
##       Score: inform, legal, transpar, regul, patient, legisl, agenc, procedur, applic, state 
## Topic 9 Top Words:
##       Highest Prob: european, social, write, polici, region, develop, parliament, eu, ppe, pt 
##       FREX: pt, social, educ, region, cohes, promot, nuno, feio, diogo, poverti 
##       Lift: atyp, feio, hübner, melo, b7-0466, carvalho, casa, diogo, fernand, grassroot 
##       Score: social, pt, region, cohes, write, educ, poverti, nuno, polici, develop 
## Topic 10 Top Words:
##       Highest Prob: product, european, consum, food, transport, agricultur, produc, market, will, sector 
##       FREX: product, food, agricultur, anim, farmer, transport, farm, safeti, consum, chain 
##       Lift: airport, chain, freight, meat, a7-0029, a7-0225, aquacultur, aviat, b7-0208, b7-0559 
##       Score: product, anim, food, consum, agricultur, farmer, transport, meat, clone, safeti 
## Topic 11 Top Words:
##       Highest Prob: state, european, member, union, countri, aid, eu, solidar, suppli, natur 
##       FREX: disast, aid, south, haiti, romania, solidar, suppli, gas, baltic, north 
##       Lift: pipelin, disast, gulf, haiti, janusz, korean, opel, a7-0112, b6-0003, baltic 
##       Score: disast, haiti, gas, suppli, solidar, aid, south, russia, state, romania 
## Topic 12 Top Words:
##       Highest Prob: energi, develop, chang, must, climat, will, global, european, strategi, research 
##       FREX: climat, emiss, energi, 2020, research, global, target, effici, water, co 
##       Lift: b7-0536, co, deforest, millennium, 2050, a6-0495, cancún, climat, danub, dioxid 
##       Score: energi, climat, emiss, 2020, research, develop, effici, strategi, environment, biodivers 
## Topic 13 Top Words:
##       Highest Prob: countri, european, polit, parliament, intern, govern, situat, freedom, conflict, human 
##       FREX: regim, israel, iran, kosovo, western, balkan, georgia, iraq, conflict, civilian 
##       Lift: belarusian, bosnia, dictatorship, gaza, iranian, isra, kosovo, kyrgyzstan, palestinian, protest 
##       Score: iran, kosovo, israel, iranian, iraq, balkan, prison, gaza, peac, elect 
## Topic 14 Top Words:
##       Highest Prob: worker, work, compani, employ, european, job, busi, market, small, labour 
##       FREX: worker, compani, enterpris, fisheri, small, employe, labour, busi, redund, medium-s 
##       Lift: bluefin, driver, enterpris, franz, obermayr, owner, reloc, self-employ, smes, tuna 
##       Score: worker, compani, employ, egf, redund, enterpris, fish, labour, unemploy, medium-s 
## Topic 15 Top Words:
##       Highest Prob: peopl, year, europ, mani, problem, countri, world, even, live, us 
##       FREX: peopl, languag, year, live, mani, china, bad, noth, problem, chines 
##       Lift: britain, latvian, teach, twenti, eija-riitta, korhola, brother, chines, mirski, fi 
##       Score: peopl, languag, young, china, world, live, year, let, europ, problem