Monthly Archives: August 2013

Twitter is a great source of network data. Using R, you can easily connect to Twitter, download a batch of tweets, and analyze them. In this case, I wanted to focus on replies and retweets. Below is a summary of the reply and retweet networks (complete code here).

The script connects to Twitter and downloads 1,000 tweets (with a one-hour delay) for any given term.
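The examples assume the twitteR and igraph packages are installed and an authenticated Twitter session. A minimal setup sketch, where the four credential values are placeholders for your own application keys:

library(twitteR)
library(igraph)

# Placeholder credentials; replace with your own Twitter application's keys
setup_twitter_oauth(consumer_key    = "YOUR_CONSUMER_KEY",
                    consumer_secret = "YOUR_CONSUMER_SECRET",
                    access_token    = "YOUR_ACCESS_TOKEN",
                    access_secret   = "YOUR_ACCESS_SECRET")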

tweets <- searchTwitter("Spanair", n=1000, language="es")

head(tweets)

[[1]] [1] "MadridNoticias: Afectados por el accidente de Spanair luchan por mantener vivo el recuerdo cinco años después http://t.co/T2Z0ntntGV #TeleMadrid"

[[2]] [1] "Franches98: El accidente que quebró a Spanair: Investigadores del grupo de investigación Applied Economics and Management … http://t.co/phZ1rcF5h7"

[[3]] [1] "FansDelPoeta_: El accidente que quebró a Spanair: Investigadores del grupo de investigación Applied Economics and Management de la Universidad de Se…"

[[4]] [1] "Ariel_x31: El accidente que quebró a Spanair: Investigadores del grupo de investigación Applied Economics and Management … http://t.co/p3pchacEUu"

[[5]] [1] "aracelii92: RT @A3Noticias: Hoy se cumplen cinco años del accidente de Spanair que costó la vida a 154 personas [VÍDEO] http://t.co/5H7qQgTEAC"

[[6]] [1] "Oye_Niinii: El accidente que quebró a Spanair: Investigadores del grupo de investigación Applied Economics and Management … http://t.co/e1jCrsdvMT"

df_tweets <- twListToDF(tweets)

str(df_tweets)

'data.frame': 1000 obs. of 12 variables:
 $ text        : chr "Afectados por el accidente de Spanair luchan por mantener vivo el recuerdo cinco años después http://t.co/T2Z0ntntGV #TeleMadri"| __truncated__ "El accidente que quebró a Spanair: Investigadores del grupo de investigación Applied Economics and Management ."| __truncated__ ...
 $ favorited   : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ replyToSN   : chr NA NA NA NA ...
 $ created     : POSIXct, format: "2013-08-20 08:11:18" "2013-08-20 08:11:12" ...
 $ truncated   : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ replyToSID  : chr NA NA NA NA ...
 $ id          : chr "369733367031283712" "369733343165689857" "369733322261274625" "369733285590478848" ...
 $ replyToUID  : chr NA NA NA NA ...
 $ statusSource: chr "<a href=\"http://dlvr.it\" rel=\"nofollow\">dlvr.it</a>" "<a href=\"http://twitterfeed.com\" rel=\"nofollow\">twitterfeed</a>" "<a href=\"http://twitterfeed.com\" rel=\"nofollow\">twitterfeed</a>" "<a href=\"http://twitterfeed.com\" rel=\"nofollow\">twitterfeed</a>" ...
 $ screenName  : chr "MadridNoticias" "Franches98" "FansDelPoeta_" "Ariel_x31" ...
 $ retweetCount: num 0 0 0 0 73 0 1 0 0 0 ...
 $ retweeted   : logi FALSE FALSE FALSE FALSE FALSE FALSE ...

# column 3 = replyToSN, column 10 = screenName
replyto <- df_tweets[3]
user <- df_tweets[10]
df_reply <- cbind(replyto, user)

str(df_reply)

'data.frame': 1000 obs. of 2 variables:
 $ replyToSN : chr NA NA NA NA ...
 $ screenName: chr "MadridNoticias" "Franches98" "FansDelPoeta_" "Ariel_x31" ...

head(df_reply)

  replyToSN     screenName
1      <NA> MadridNoticias
2      <NA>     Franches98
3      <NA>  FansDelPoeta_
4      <NA>      Ariel_x31
5      <NA>     aracelii92
6      <NA>     Oye_Niinii

replies <- subset(df_reply, !is.na(replyToSN))  # keep only actual replies

str(replies)

'data.frame': 20 obs. of 2 variables:
 $ replyToSN : chr "elmundoes" "A3Noticias" "elmundoes" "A3Noticias" ...
 $ screenName: chr "jgrvilla" "OMerinoG" "jgrvilla" "OMerinoG" ...

head(replies)

    replyToSN screenName
49  elmundoes   jgrvilla
90 A3Noticias   OMerinoG

text_1 <- subset(df_tweets, !is.na(replyToSN))

str(text_1)

'data.frame': 20 obs. of 12 variables:
 $ text      : chr "\"@elmundoes: 5º aniversario del accidente de Spanair. Un estudio lo vincula con la quiebra de la compañís aérea. http://t.co/73"| __truncated__ "\"@A3Noticias: Hoy se cumplen cinco años del accidente de Spanair que costó la vida a 154 personas [VÍDEO] http://t.co/S5wH6TE90"| __truncated__ ...
 $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ replyToSN : chr "elmundoes" "A3Noticias" "elmundoes" "A3Noticias" ...
 ...

# columns: 1 = text, 3 = replyToSN, 10 = screenName
ind <- c(1,3,10)

replies_def <- text_1[ind]

str(replies_def)

'data.frame': 20 obs. of 3 variables:
 $ text      : chr "\"@elmundoes: 5º aniversario del accidente de Spanair. Un estudio lo vincula con la quiebra de la compañís aérea. http://t.co/73"| __truncated__ "\"@A3Noticias: Hoy se cumplen cinco años del accidente de Spanair que costó la vida a 154 personas [VÍDEO] http://t.co/S5wH6TE90"| __truncated__ ...
 $ replyToSN : chr "elmundoes" "A3Noticias" "elmundoes" "A3Noticias" ...
 $ screenName: chr "jgrvilla" "OMerinoG" "jgrvilla" "OMerinoG" ...

# Build an undirected graph from the reply pairs and plot it
df.g <- graph.data.frame(d = replies, directed = FALSE)

plot(df.g, vertex.size=4, vertex.label=NA, vertex.color="orange",
     vertex.label.color="black", vertex.frame.color="white",
     edge.color="grey", edge.arrow.size=0.01)

[Figure: replies_spanair, the reply network for "Spanair"]
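A quick tabulation of the same data shows which accounts receive the most replies in this sample (a small sketch on the replies data frame built above):

# Accounts most replied to
head(sort(table(replies$replyToSN), decreasing = TRUE))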

# Tweets that look like manual retweets ("RT @user ...")
rts <- grep("^rt @[a-z0-9_]{1,15}", tolower(df_tweets$text), perl=TRUE, value=TRUE)

# Sender: the author of each matching tweet
rt.sender <- tolower(as.character(df_tweets$screenName[grep("^rt @[a-z0-9_]{1,15}", tolower(df_tweets$text), perl=TRUE)]))

# Receiver: the retweeted account, captured from the pattern
rt.receiver <- gsub("^rt @([a-z0-9_]{1,15})[^a-z0-9_]+.*$", "\\1", rts, perl=TRUE)

rt.sender[rt.sender==""] <- "<NA>"

rt.receiver[rt.receiver==""] <- "<NA>"

rts.df <- data.frame(rts, rt.sender, rt.receiver)

str(rts.df)

'data.frame': 480 obs. of 3 variables:
 $ rts        : Factor w/ 36 levels "rt @a3noticias: hoy se cumplen cinco años del accidente de spanair que costó la vida a 154 personas [vídeo] http://t.co/5h7qqgt"| __truncated__,..: 1 16 23 20 5 24 10 25 8 35 ...
 $ rt.sender  : Factor w/ 44 levels "adol_izquierdo",..: 6 4 37 10 8 42 5 41 38 26 ...
 $ rt.receiver: Factor w/ 30 levels "a3noticias","abc_es",..: 1 13 19 16 4 20 8 21 7 29 ...
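As a sanity check, the capture pattern can be tried on a single made-up tweet (the example string is hypothetical):

example <- "RT @A3Noticias: Hoy se cumplen cinco años del accidente de Spanair"
gsub("^rt @([a-z0-9_]{1,15})[^a-z0-9_]+.*$", "\\1", tolower(example), perl = TRUE)
# [1] "a3noticias"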

ddf <- rts.df[c(2:3)]

str(ddf)

'data.frame': 480 obs. of 2 variables:
 $ rt.sender  : Factor w/ 44 levels "adol_izquierdo",..: 6 4 37 10 8 42 5 41 38 26 ...
 $ rt.receiver: Factor w/ 30 levels "a3noticias","abc_es",..: 1 13 19 16 4 20 8 21 7 29 ...

m <- as.matrix(ddf)

head(m,10)

     rt.sender      rt.receiver
[1,] "aracelii92"   "a3noticias"
[2,] "almudenamama" "fcapilargomez"
[3,] "razican"      "la_informacion"
...

rts.df.2 <- data.frame(rt.sender,rt.receiver)

rts.g <- graph.data.frame(rts.df.2, directed=TRUE)

plot(rts.g, vertex.size=4, vertex.label=NA, vertex.color="red",
     vertex.label.color="black", vertex.frame.color="white",
     edge.color="grey", edge.arrow.size=0.01)

[Figure: retweets_spanair, the retweet network for "Spanair"]
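With the graph in hand, igraph's degree() gives a quick ranking of the most retweeted accounts; a short sketch (in-degree counts incoming retweet edges):

# Accounts with the most incoming retweet edges
head(sort(degree(rts.g, mode = "in"), decreasing = TRUE), 5)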

 

I ran some tests with different terms on August 19th and 20th, 2013. Below are the resulting networks.

Retweets "rajoy"

[Figure: retweets_rajoy]

Retweets "barcenas"

[Figure: retweets_barcenas]

Retweets "eshumorcom"

[Figure: retweets_eshumorcom]

Retweets "gif"

[Figure: retweet_gif]

Association rule mining is a popular and well-researched method for discovering interesting relations between variables in large databases. It aims to identify strong rules in databases using different measures of interestingness. Quoting the definition from the "arules" R package vignette:

—————————————–

Mining frequent itemsets and association rules is a popular and well researched method for discovering interesting relations between variables in large databases. Piatetsky-Shapiro (1991) describes analyzing and presenting strong rules discovered in databases using different measures of interestingness. Based on the concept of strong rules, Agrawal, Imielinski, and Swami (1993) introduced the problem of mining association rules from transaction data as follows:

I = {i1, i2, ..., in} -> a set of n binary attributes called items

D = {t1, t2, ..., tm} -> a set of transactions called the database

Each transaction in D has a unique transaction ID and contains a subset of the items in I. A rule is defined as an implication of the form X ⇒ Y where X, Y ⊆ I and X ∩ Y = ∅. The sets of items (itemsets for short) X and Y are called the antecedent (left-hand side, LHS) and consequent (right-hand side, RHS) of the rule.

To illustrate the concepts, we use a small example from the supermarket domain. The set of items is I = {milk, bread, butter, beer}, and a small database contains the following transactions:

1 – milk, bread

2 – bread, butter

3 – beer

4 – milk, bread, butter

5 – bread, butter

An example rule for the supermarket could be {milk, bread} ⇒ {butter}, meaning that if milk and bread are bought, customers also buy butter.

—————————————–
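To make the standard measures concrete: for the rule {milk, bread} ⇒ {butter} above, support is the share of transactions containing all three items (1/5 = 0.2) and confidence is the share of transactions containing milk and bread that also contain butter (1/2 = 0.5). A minimal sketch that checks this with arules (the toy transactions are typed in directly):

library(arules)

# The five toy transactions from the example above
toy <- as(list(
  c("milk", "bread"),
  c("bread", "butter"),
  c("beer"),
  c("milk", "bread", "butter"),
  c("bread", "butter")
), "transactions")

# Thresholds chosen so the example rule survives
toy_rules <- apriori(toy, parameter = list(support = 0.2, confidence = 0.5, minlen = 2))
inspect(toy_rules)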

Below is a summary (see complete code here) of what you can do in R with association rules using the "arules" and "arulesViz" packages:

library(arules)

##### Epub example #####

data("Epub")

Epub

transactions in sparse format with 15729 transactions (rows) and 936 items (columns)

summary(Epub)

transactions as itemMatrix in sparse format with
 15729 rows (elements/itemsets/transactions) and
 936 columns (items) and a density of 0.001758755

includes extended item information - examples:
   labels
1 doc_11d
2 doc_13d
3 doc_14c

includes extended transaction information - examples:
      transactionID           TimeStamp
10792  session_4795 2003-01-02 02:59:00

# size() counts items per transaction; Epub2003 (the 2003 subset) follows the arules vignette:
year <- strftime(as.POSIXlt(transactionInfo(Epub)[["TimeStamp"]]), "%Y")
Epub2003 <- Epub[year == "2003"]
transactionInfo(Epub2003[size(Epub2003) > 20])

      transactionID           TimeStamp
11092  session_56e2 2003-04-29 19:30:38
11371  session_6308 2003-08-18 00:16:12

##### Arules Viz #####

library(arulesViz)
data("Groceries")
summary(Groceries)

transactions as itemMatrix in sparse format with
 9835 rows (elements/itemsets/transactions) and
 169 columns (items) and a density of 0.02609146

most frequent items:
      whole milk other vegetables       rolls/buns             soda           yogurt
            2513             1903             1809             1715             1372
         (Other)
           34055

# Mining association rules using the Apriori algorithm
rules <- apriori(Groceries, parameter=list(support=0.001, confidence=0.5))
rules

parameter specification:
 confidence minval smax arem aval originalSupport support minlen maxlen target ext
        0.5    0.1    1 none FALSE            TRUE   0.001      1     10  rules FALSE

algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE
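A natural follow-up (not shown in the original output) is summary(), which reports how many rules were found and the distribution of their quality measures:

summary(rules)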

# Top three rules with respect to the lift measure
inspect(head(sort(rules, by="lift"), 3))

  lhs                             rhs              support     confidence lift
1 {Instant food products, soda} => {hamburger meat} 0.001220132 0.6315789  18.99565
2 {soda, popcorn}               => {salty snack}    0.001220132 0.6315789  16.69779
3 {flour, baking powder}        => {sugar}          0.001016777 0.5555556  16.40807

# Plotting rules. The plot method's generic signature in arulesViz:
# plot(x, method = NULL, measure = "support", shading = "lift", interactive = FALSE, data)
plot(rules)

[Figure: arules_1]

plot(rules, measure=c("support", "lift"), shading="confidence")

[Figure: arules_2]

plot(rules, shading="order", control=list(main = "Two-key plot"))

[Figure: arules_3]

# Interactive plotting

sel <- plot(rules, measure=c("support", "lift"), shading="confidence", interactive=TRUE)

[Figure: arules_4]

[Figure: arules_4_bis]

# Matrix based visualizations
subrules <- rules[quality(rules)$confidence > 0.8]
subrules
plot(subrules, method="matrix", measure="lift")

[Figure: arules_5]

plot(subrules, method="matrix", measure="lift", control=list(reorder=TRUE))

[Figure: arules_6]

# Grouped matrix based visualizations
plot(rules, method="grouped")

[Figure: arules_7]

# Graph based visualizations

subrules2 <- head(sort(rules, by="lift"), 10)
plot(subrules2, method="graph")

[Figure: arules_8]

plot(subrules2, method="graph", control=list(type="items"))

[Figure: arules_9]

# Export graph as graphml
saveAsGraph(head(sort(rules, by="lift"), 1000), file="rules.graphml")
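The exported file can be opened in Gephi or read back into R; a minimal sketch with igraph (the file name matches the call above):

library(igraph)
g <- read.graph("rules.graphml", format = "graphml")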

# Parallel coordinates
plot(subrules2, method="paracoord")

[Figure: arules_10]

plot(subrules2, method="paracoord", control=list(reorder=TRUE))

[Figure: arules_11]

# Double decker plot
oneRule <- sample(rules, 1)
inspect(oneRule)

  lhs                                            rhs          support     confidence lift
1 {other vegetables, frozen vegetables, soda} => {whole milk} 0.001626843 0.5        1.956825

plot(oneRule, method="doubledecker", data = Groceries)

[Figure: arules_12]


The Spanish Congress of Deputies is processing a law on transparency and access to public information. During the legislative process, a group of independent experts appeared before the deputies to convey their views on how this area should be legislated. This is an analysis of the texts of the 24 appearances by independent experts before the Spanish Congress of Deputies. The analysis focuses on word frequency, leaving aside other techniques such as POS tagging or clustering. Code available on GitHub and data here.

# libraries

library(tm)
library(Snowball)
library(ggplot2)
library(reshape)

# Load text files and build a corpus (text document collection)

directory <- DirSource(directory="~/Desktop/rstats/transp_texts")
corpus <- Corpus(directory)

# Inspect the corpus

class(corpus)

[1] "VCorpus" "Corpus"  "list"
length(corpus)

[1] 24
corpus[[1]]

"x" "1" "He intentado acotar mi intervención lo máximo posible, espero que se entienda y si hay alguna duda intentaré aclararlo en las respuestas. En primer lugar, quiero agradecer la oportunidad de comparecer ante esta Comisión con motivo de la tramitación del proyecto de ley de transparencia, acceso a la información y buen gobierno. Entiendo que mi presencia aquí es precisamente por el tema que nos ocupa, el de la transparencia y el acceso a la información, que está estrechamente ligado a la protección de datos. Por ello, me permitiré compartir con ustedes algunas reflexiones sobre la relación entre estos dos derechos -el derecho de acceso a la información y el de protección de datos de carácter personal-, para a continuación incidir en algunos aspectos de la regulación contenida en el proyecto de ley que se está tramitando. Un profesor, que califica el derecho a la información como un derecho de tercera generación, sostiene que dicho derecho persigue convertir la Administración en una casa de cristal."

# Refine the corpus

corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("spanish"))
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = FALSE)

# Build Term Document Matrix and Document Term Matrix

tdm <- TermDocumentMatrix(corpus)
class(tdm)

[1] "TermDocumentMatrix"    "simple_triplet_matrix"
dim(tdm)

[1] 6414   24
dtm <- DocumentTermMatrix(corpus)
class(dtm)

[1] "DocumentTermMatrix"    "simple_triplet_matrix"
dim(dtm)

[1]   24 6414
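As a quick check that the matrices line up, tm's inspect() can display a corner of the sparse matrix; a small sketch showing counts of the first five terms in the first three documents:

inspect(tdm[1:5, 1:3])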

# Word Frequency
# Terms with a minimum frequency

min_word_freq <- findFreqTerms(tdm, lowfreq=50)
length(min_word_freq)

[1] 61
min_word_freq

 [1] "acceso" "administración" "administraciones" "ámbito"
 [5] "aquí" "archivos" "artículo" "así"
 [9] "buen" "caso" "ciudadanos" "comisión"
[13] "cómo" "creo" "cualquier" "cuentas"
[17] "datos" "debe" "decir" "derecho"
[21] "derechos" "dice" "documentos" "ejemplo"
[25] "españa" "forma" "fundamental" "general"
[29] "gestión" "gobierno" "hecho" "importante"
[33] "información" "interés" "ley" "lugar"
[37] "materia" "mismo" "parece" "parte"
[41] "partidos" "personas" "política" "políticos"
[45] "principio" "procedimiento" "protección" "proyecto"
[49] "pública" "públicas" "público" "públicos"
[53] "relación" "silencio" "sino" "sociedad"
[57] "tema" "todas" "transparencia" "tribunal"
[61] "ustedes"

# Word associations: Find words correlated with a word

word_aso <- findAssocs(tdm, "transparencia", 0.6)
length(word_aso)

[1] 28

word_aso

word_aso_names <- attributes(word_aso)$names
df_word_aso <- as.data.frame(word_aso)
df_word_aso <- cbind(df_word_aso,word_aso_names)
df_word_aso

             word_aso word_aso_names
autónomas        0.75      autónomas
auditoría        0.71      auditoría
distintas        0.68      distintas
hablado          0.67        hablado
colación         0.64       colación
dependiente      0.64    dependiente
diputaciones     0.64   diputaciones
índices          0.64        índices
publicar         0.64       publicar
df_word_aso_t <- transform(df_word_aso,word_aso_names=reorder(word_aso_names,word_aso))
qplot(word_aso,word_aso_names,data=df_word_aso_t)

[Figure: word_aso]

# Frequency order
# Most frequent terms

top_freq_terms <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
head(top_freq_terms,20)

df_top_freq_terms <- as.data.frame(top_freq_terms)
top_freq_terms_names <- attributes(top_freq_terms)$names
df_top_freq_terms <- cbind(df_top_freq_terms,top_freq_terms_names)
df_top_freq_terms_20 <-head(df_top_freq_terms,20)
df_top_freq_terms_t <- transform(df_top_freq_terms_20,top_freq_terms_names=reorder(top_freq_terms_names,top_freq_terms))
qplot(top_freq_terms,top_freq_terms_names,data=df_top_freq_terms_t)

[Figure: word_freq]

# Total words per author

total_word_per_author <- sort(colSums(as.matrix(tdm)), decreasing=TRUE)
head(total_word_per_author)
df_total_word_per_author <- as.data.frame(total_word_per_author)
total_word_per_author_names <- attributes(total_word_per_author)$names
df_total_word_per_author <- cbind(df_total_word_per_author,total_word_per_author_names)
le <- length(total_word_per_author_names)
head(df_total_word_per_author,le)
df_total_word_per_author_t <- transform(df_total_word_per_author,total_word_per_author_names=reorder(total_word_per_author_names,total_word_per_author))
qplot(total_word_per_author,total_word_per_author_names,data=df_total_word_per_author_t)

[Figure: word_per_author]

# Word matrices per author

# Full term-document matrix (6414 terms x 24 documents)
matriz <- as.matrix(tdm[1:6414, 1:24])
matriz_row_names <- row.names(matriz)
matriz_dframe <- as.data.frame(matriz)
matriz_dframe <- cbind(matriz_dframe,matriz_row_names)
str(matriz_dframe)
t(head(matriz_dframe))

# Sort the matrix by one author (e.g. Soledad Becerril)

df_order_by_author <- matriz_dframe[order(matriz_dframe$soledadbecerril.txt,decreasing=TRUE),]
head(df_order_by_author)
dim(df_order_by_author)
df_order_by_author_melt <- melt(df_order_by_author, id="matriz_row_names")
str(df_order_by_author_melt)
head(df_order_by_author_melt)

# Top words

df_order_by_author_top <- head(df_order_by_author,20)
str(df_order_by_author_top)
df_order_by_author_top_melt <- melt(df_order_by_author_top, id="matriz_row_names")
str(df_order_by_author_top_melt)
head(df_order_by_author_top_melt)
qplot(value,matriz_row_names,data=df_order_by_author_top_melt) + facet_wrap(~ variable)

[Figure: word_per_given_author]

qplot(value,variable,data=df_order_by_author_top_melt) + facet_wrap(~ matriz_row_names)

[Figure: order_by_author]

# Dictionaries

dic <- Dictionary(c("transparencia","ley","información","derecho","acceso","datos"))
ma <- DocumentTermMatrix(corpus, list(dictionary = dic))
inspect(ma)
df <- as.data.frame((as.matrix(ma)))
str(df)
df
sort(total_word_per_author_names)
name_author <- sort(total_word_per_author_names)
df <- cbind(df,name_author)
df
df_total_word_per_author
names(df_total_word_per_author) <- c("total_word_per_author","name_author")
merge <- merge(df,df_total_word_per_author,by="name_author")
merge

# Plotting

qplot(acceso,name_author,data=merge)
qplot(datos,name_author,data=merge)
qplot(derecho,name_author,data=merge)
qplot(información,name_author,data=merge)
qplot(ley,name_author,data=merge)
qplot(transparencia,name_author,data=merge)
qplot(acceso/total_word_per_author,name_author,data=merge)
qplot(datos/total_word_per_author,name_author,data=merge)
qplot(derecho/total_word_per_author,name_author,data=merge)
qplot(información/total_word_per_author,name_author,data=merge)
qplot(ley/total_word_per_author,name_author,data=merge)
qplot(transparencia/total_word_per_author,name_author,data=merge)

[Figures: acceso_author, datos_author, derecho_author, informacion_author, ley_author, transparencia_author (raw counts per author), and acceso_author_p, datos_author_p, derecho_author_p, information_author_p, ley_author_p, transparencia_author_p (counts normalized by each author's total words)]