# 4chan /pol/ scraper — collects the hrefs and thread text from the first
# ten index pages, tokenizes and cleans the words, then visualizes word
# frequency and NRC sentiment, and saves the counts to CSV.
##### Code Below #####
# Load Libraries ----
library("rvest")      # HTML scraping (read_html, html_elements, html_attr)
library("tidyverse")  # also attaches ggplot2 and dplyr
library("ggplot2")    # explicit load; a no-op after tidyverse
library("wordcloud")  # wordcloud()
library("tidytext")   # unnest_tokens(), stop_words
library("tinytex")
library("syuzhet")    # get_nrc_sentiment()
library("lubridate")
library("scales")
library("reshape2")
library("dplyr")      # explicit load; a no-op after tidyverse
# Scrape internal links from the first 10 index pages of /pol/ ----
#
# Each index page is scraped for every <a> element's href attribute.
# Page 1 lives at the board root URL; pages 2-10 live at /pol/<page>.
# The per-page character vectors are concatenated into df_pol, which
# downstream code filters for thread links.

# Return the href of every <a> element on one /pol/ index page.
scrape_pol_page <- function(page) {
  url <- if (page == 1) {
    "https://boards.4channel.org/pol/"
  } else {
    paste0("https://boards.4channel.org/pol/", page)
  }
  read_html(url) %>%
    html_elements("a") %>%
    html_attr("href")
}

# Combining all of the scraped hrefs into one character vector
df_pol <- unlist(lapply(seq_len(10), scrape_pol_page))
# Put the scraped hrefs into a tibble so tidyverse verbs can be applied
pol_table <- tibble(txt = df_pol)

# Keep only hrefs that look like thread links: "thread/<6+ digits>/<slug>"
df_links <- pol_table %>%
  filter(str_detect(txt, "(thread/[0-9]{6,}[/][a-z]{1,})"))

# Prepend the board URL so each row becomes an absolute thread URL.
# NOTE(review): the index pages were fetched from boards.4channel.org but
# threads are prefixed with boards.4chan.org — both domains served the
# same boards at the time; confirm this is intentional.
df_links$txt <- paste0("https://boards.4chan.org/pol/", df_links$txt)
# Fetch every thread URL and extract its full page text.
# html_text2() collapses the whole parsed document to readable text,
# so one character scalar is produced per thread.
threads <- lapply(df_links$txt, function(thread_url) {
  read_html(thread_url) %>%
    html_text2()
})

# Wrap the list of page texts in a tibble so tidytext can manipulate it
threads_tibble <- tibble(txt = threads)
# Tokenize: break each thread's text into one lowercase word per row
tidy_pol <- threads_tibble %>%
  unnest_tokens(output = word, input = txt, format = "text")
# Custom noise words to drop in addition to tidytext's stop_words:
# 4chan UI chrome ("reply", "catalog", "settings", ...), board
# abbreviations ("lit", "biz", "sci", ...), file-type tokens ("jpg",
# "webm", ...), day abbreviations, and run-together scrape artifacts.
custom_stop_words <- c(
  "fucking", "https", "shit", "id", "anonymous", "wed", "kb", "var",
  "png", "mobile", "mb", "catalog", "settings", "display", "advertise",
  "pass", "bottom", "pol", "jpg", "view", "vp", "ad", "tv", "fit",
  "post", "thread", "hr", "gif", "webm", "incorrect", "tg", "comments",
  "search", "top", "site", "home", "reply", "board", "politically",
  "return", "time", "owned", "added", "vip", "users", "rules", "legal",
  "lgbt", "lit", "file", "mu", "hide", "fa", "responsibility", "style",
  "options", "table", "page", "serve", "contact", "images",
  "international", "poster", "people", "true", "bant", "vm", "vmg",
  "vrpg", "vst", "read", "news", "image", "posts", "jp", "sci", "vg",
  "po", "toy", "vt", "wg", "biz", "ck", "desktop", "enable", "feedback",
  "int", "verification", "respective", "vr", "wsg", "aco", "adv",
  "delete", "cm", "disable", "bfutababurichantomorrowphoton", "cgl",
  "comlen", "cooldowns", "copyrights", "cssversion", "diy", "gd", "hc",
  "ic", "incorrectreturn", "jsversion", "maxfilesize", "maxlines",
  "mlp", "payment", "postform", "pw", "qa", "qst", "recaptcha",
  "refresh", "replyreturn", "soc", "sp", "trademarks", "trv",
  "uploaded", "hm", "xs", "yotsubayotsuba", "boards", "faq",
  "announcementcrypto", "bolsheviknatonazihippiepiraterepublicantask",
  "bypass", "capitalistanarchistblack", "flaggeographic",
  "huggerunited", "locationanarcho", "login",
  "nationalistconfederatecommunistcataloniademocrateuropeanfascistgadsdengayjihadikekistanimuslimnational",
  "nationswhite", "refreshpost", "supremacistfileplease",
  "ztemplartree", "posters", "wpjizlog", "xxfbsv", "wsr", "mon", "tue",
  "thu", "fri", "sat", "sun", "tues"
)

# Keep only purely alphabetic words that are neither standard stop
# words nor scrape-specific noise. (The original expressed this as
# ~140 chained `& !word == "x"` comparisons; `%in%` is equivalent.)
tidy_pol_fixed <- tidy_pol %>%
  filter(
    !word %in% stop_words$word,    # tidytext's standard stop-word list
    !word %in% custom_stop_words,  # scrape-specific noise (above)
    !grepl("[^A-Za-z]", word)      # drop tokens with digits/punctuation
  )
# Normalize word variants onto a single canonical form ----
#
# Each pattern/replacement pair is applied sequentially with
# str_replace() (first regex match within each word), exactly as the
# original one-call-per-line version did. ORDER MATTERS: e.g.
# "glow" -> "glowie" runs before the longer "glow*" clean-up rules,
# and "guy" -> "guys" is folded back afterwards by
# "guyss"/"guys" -> "guy", so the sequence below must not be reordered.
word_replacements <- c(
  "niggers"     = "nigger",
  "jews"        = "jew",
  "jewish"      = "jew",
  "woman"       = "women",
  "fucked"      = "fuck",
  "fucks"       = "fuck",
  "fuckers"     = "fuck",
  "fuckin"      = "fuck",
  "fucker"      = "fuck",
  "retards"     = "retard",
  "retarded"    = "retard",
  "retardation" = "retard",
  "campaings"   = "campaign",
  "posted"      = "posting",
  "shitskins"   = "nigger",
  "shitskin"    = "nigger",
  "christians"  = "christian",
  "israeli"     = "israel",
  "guy"         = "guys",
  "whites"      = "white",
  "indians"     = "indian",
  "poos"        = "poo",
  "parties"     = "party",
  "kikes"       = "jew",
  "kike"        = "jew",
  "glow"        = "glowie",
  "glowies"     = "glowie",
  "glows"       = "glowie",
  "glowfags"    = "glowie",
  "glowshills"  = "glowie",
  "guyss"       = "guy",
  "guys"        = "guy",
  "shills"      = "shill",
  "shilling"    = "shill",
  "shilled"     = "shill"
)

for (i in seq_along(word_replacements)) {
  tidy_pol_fixed$word <- str_replace(
    tidy_pol_fixed$word,
    names(word_replacements)[i],
    word_replacements[[i]]
  )
}
# Count word frequencies, most frequent first
tidy_pol_fixed2 <- tidy_pol_fixed %>%
  count(word, sort = TRUE)

# Show the ten most frequent words (print() returns its input
# invisibly, so the assignment above is unaffected)
print(tidy_pol_fixed2, n = 10)
# Goodness-of-fit test (chi-squared). A p-value < 0.05 is a significant
# result that the top words are not uniformly distributed — their
# probabilities of showing up differ from one another.
top <- subset(tidy_pol_fixed2, n >= 200)
chi <- chisq.test(top$n)
options(scipen = 999)  # NOTE: global side effect — disables scientific notation
chi
# Time to Visualize ----

# Horizontal bar chart of the 50 most frequent words.
# slice_max(order_by = n, n = 50) replaces the superseded top_n(50);
# both keep ties, so the selected rows are identical.
tidy_pol_fixed2 %>%
  slice_max(order_by = n, n = 50) %>%
  mutate(word = reorder(word, n)) %>%  # order bars by count
  ggplot(aes(word, n, fill = n)) +
  scale_fill_distiller(palette = "Greens") +
  geom_col() +
  labs(
    title = "Most Used Words",
    x = "Words",
    y = "Count",
    fill = "Results"
  ) +
  coord_flip() +
  theme_dark(base_size = 12.5)
# Word cloud of the 100 most frequent words, horizontal only
with(
  tidy_pol_fixed2,
  wordcloud(
    word, n,
    max.words    = 100,
    random.order = FALSE,
    rot.per      = 0.0,
    colors       = brewer.pal(8, "Dark2")
  )
)
# Sentiment analysis with the NRC lexicon, visualized as a bar graph.
# get_nrc_sentiment() is slow: roughly 5 minutes on a Ryzen 7 3700X
# with 16 GB of RAM.
# NOTE(review): scores are computed over the deduplicated word list, so
# each distinct word contributes once regardless of its count n —
# confirm that unweighted sentiment is intended.
sentiment_tidy_pol_fixed_2 <- get_nrc_sentiment(tidy_pol_fixed2$word)

# One bar per NRC emotion/valence category
barplot(
  colSums(sentiment_tidy_pol_fixed_2),
  las  = 2,
  col  = rainbow(10),
  ylab = 'Count',
  main = 'Pol Sentiment Scores'
)
# Time to Save the Data ----
# Use a filesystem-safe timestamp: the original "%b %d %Y %X" format
# embeds colons (illegal in filenames on Windows) and locale-dependent
# month names; "%Y-%m-%d_%H-%M-%S" is portable and sorts correctly.
timestamp <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
filename <- paste0("~/Documents/Stats/4Chan Scraper/", timestamp, ".csv")
# row.names = FALSE drops the meaningless row-number column from the CSV
write.csv(tidy_pol_fixed2, file = filename, row.names = FALSE)