# 4Chan-Web-Scraper-v2/scripts/4ChanScraperv2.R
##### Code Below #####
#Load Libraries
library("rvest")
library("tidyverse")
library("ggplot2")
library("wordcloud")
library("tidytext")
library("tinytex")
library("syuzhet")
library("lubridate")
library("scales")
library("reshape2")
library("dplyr")
library("qdapDictionaries")
# library("tm") Dont need
# library("qdap") Dont need
# This scraping gets all of the internal links from the first 10 index pages.
# Page 1 lives at the board root; pages 2-10 live at /pol/2 ... /pol/10.
get_hrefs <- function(url) {
  read_html(url) %>%
    html_elements("a") %>%
    html_attr("href")
}
index_urls <- c("https://boards.4channel.org/pol/",
                paste0("https://boards.4channel.org/pol/", 2:10))
# Combining all of the scraped links into one character vector
df_pol <- unlist(lapply(index_urls, get_hrefs))
# tibble() makes a one-column table out of the scraped links
pol_table <- tibble(txt = df_pol)
# Keep only the links that look like: "thread/123456789/this-is-a-thread"
df_links <- pol_table %>%
  filter(str_detect(txt, "(thread/[0-9]{6,}[/][a-z]{1,})"))
# Next step is prepending "https://boards.4chan.org/pol/" onto each "thread/..." path.
df_links$txt <- paste("https://boards.4chan.org/pol/", df_links$txt, sep = "")
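# Suggestion (not in the original pipeline): the same thread link can appear
# more than once on an index page (e.g., the title link and the reply link),
# so de-duplicating here would avoid scraping a thread twice:
# df_links <- distinct(df_links)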
# This code will "apply"
# "read_html" and
# "html_text2"
# to each row in the data frame, returning one block of text per thread
threads <- lapply(df_links$txt, function(x) {
  read_html(x) %>%
    html_text2()})
# Turn "threads" into a tibble so tidytex can manipulate it
threads_tibble <- tibble(txt = threads)
# Break up all of the sentences into single words
tidy_pol <- threads_tibble %>%
unnest_tokens(word, txt, format = "text")
# Doesn't perform well
# https://github.com/ccs-amsterdam/r-course-material/blob/master/tutorials/R_text_3_quanteda.md
# tidy_pol <- removeNumbers(tidy_pol$word) %>%
# removePunctuation() %>%
# stripWhitespace()
# threads_tibble <- tibble(txt = tidy_pol)
# tidy_pol <- threads_tibble %>%
# unnest_tokens(word, txt, format = "text")
# mutate(token_stem = wordStem(word))
# tidy_pol_test <- as.list(tidy_pol)
# tidy_pol_test2 <- tokens(tidy_pol_test)
# tidy_pol_test3 <- tokens_wordstem(tidy_pol_test2)
# tokens_rbind <- do.call(rbind, tidy_pol_test3)
# tokens_col <- tibble(txt = tokens_rbind)
# tokens_col_all <-data.frame(txt = c(t(tokens_col)), stringsAsFactors=FALSE)
# Put this in the ngram script
#filter(!grepl('[0-9]', word))
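# Hedged sketch of the ngram idea referenced above (the real version lives in
# the separate ngram script): tidytext can tokenize into bigrams directly, and
# the grepl filter would then drop any bigram containing a digit.
# tidy_pol_bigrams <- threads_tibble %>%
#   unnest_tokens(bigram, txt, token = "ngrams", n = 2) %>%
#   filter(!grepl('[0-9]', bigram))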
# Custom stopwords: board furniture, sitewide boilerplate, other board names,
# day abbreviations, captcha strings, and a handful of words that drown out
# everything else. (Duplicates "shit" and "wed" from the old filter removed.)
custom_stopwords <- c(
  "fucking", "https", "shit", "id", "anonymous", "wed", "kb", "var", "png",
  "mobile", "mb", "catalog", "settings", "display", "advertise", "pass",
  "bottom", "pol", "jpg", "view", "vp", "ad", "tv", "fit", "post", "thread",
  "hr", "gif", "webm", "incorrect", "tg", "comments", "search", "top", "site",
  "home", "reply", "board", "politically", "return", "time", "owned", "added",
  "vip", "users", "rules", "legal", "lgbt", "lit", "file", "mu", "hide", "fa",
  "responsibility", "style", "options", "table", "page", "serve", "contact",
  "images", "international", "poster", "people", "true", "bant", "vm", "vmg",
  "vrpg", "vst", "read", "news", "image", "posts", "jp", "sci", "vg", "po",
  "toy", "vt", "wg", "biz", "ck", "desktop", "enable", "feedback", "int",
  "verification", "respective", "vr", "wsg", "aco", "adv", "delete", "cm",
  "disable", "bfutababurichantomorrowphoton", "cgl", "comlen", "cooldowns",
  "copyrights", "cssversion", "diy", "gd", "hc", "ic", "incorrectreturn",
  "jsversion", "maxfilesize", "maxlines", "mlp", "payment", "postform", "pw",
  "qa", "qst", "recaptcha", "refresh", "replyreturn", "soc", "sp",
  "trademarks", "trv", "uploaded", "hm", "xs", "yotsubayotsuba", "boards",
  "faq", "announcementcrypto", "bolsheviknatonazihippiepiraterepublicantask",
  "bypass", "capitalistanarchistblack", "flaggeographic", "huggerunited",
  "locationanarcho", "login",
  "nationalistconfederatecommunistcataloniademocrateuropeanfascistgadsdengayjihadikekistanimuslimnational",
  "nationswhite", "refreshpost", "supremacistfileplease", "ztemplartree",
  "posters", "wpjizlog", "xxfbsv", "wsr", "mon", "tue", "thu", "fri", "sat",
  "sun", "tues", "emyqupza", "nlcbzjyk", "oq", "avqxwczz", "lot", "day",
  "guy", "announcement", "yeah", "anon", "watch"
)
tidy_pol_fixed <- tidy_pol %>%
  filter(!word %in% stop_words$word,
         !word %in% custom_stopwords,
         !grepl("[^A-Za-z]", word),   # letters only: drops numbers, punctuation
         # drop one- and two-letter tokens; the original compared against the
         # literal string "[a-zA-Z]{,2}", which never matched anything
         nchar(word) > 2,
         word %in% GradyAugmented)    # keep real dictionary words only
# Below collapses variant spellings / inflections onto a single form.
# A named lookup vector with exact, whole-word matching avoids the ordering
# and substring surprises of the old chained str_replace() calls (e.g.
# "glow" -> "glowie" used to fire before "glowfags" could ever be matched,
# leaving "glowiefags" behind).
replacements <- c(
  "niggers" = "nigger", "shitskin" = "nigger", "shitskins" = "nigger",
  "jews" = "jew", "jewish" = "jew", "kike" = "jew", "kikes" = "jew",
  "woman" = "women",
  "fucked" = "fuck", "fucks" = "fuck", "fucker" = "fuck",
  "fuckers" = "fuck", "fuckin" = "fuck",
  "retards" = "retard", "retarded" = "retard", "retardation" = "retard",
  "campaings" = "campaign", "campaigns" = "campaign",
  "posted" = "posting",
  "christians" = "christian",
  "israeli" = "israel",
  "whites" = "white",
  "indians" = "indian",
  "poos" = "poo",
  "parties" = "party",
  "glow" = "glowie", "glows" = "glowie", "glowies" = "glowie",
  "glowfags" = "glowie", "glowshills" = "glowie",
  "guys" = "guy", "guyss" = "guy",
  "shills" = "shill", "shilling" = "shill", "shilled" = "shill",
  "europeans" = "european",
  "catholics" = "catholic",
  "masks" = "mask",
  "russians" = "russian",
  "chink" = "chinese", "chinks" = "chinese",
  "gook" = "japanese", "gooks" = "japanese",
  "governments" = "government",
  "american" = "america",
  "faggots" = "faggot"
)
tidy_pol_fixed$word <- ifelse(tidy_pol_fixed$word %in% names(replacements),
                              replacements[tidy_pol_fixed$word],
                              tidy_pol_fixed$word)
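# Alternative (hedged): a stemmer automates most of the table above, at the
# cost of some odd-looking stems. SnowballC is the usual choice and was the
# direction of the wordStem() experiment commented out earlier:
# library("SnowballC")
# tidy_pol_fixed$word <- wordStem(tidy_pol_fixed$word, language = "english")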
# Count each word, sort by frequency, and stamp on today's date
date <- Sys.Date()
tidy_pol_fixed2 <- tidy_pol_fixed %>%
  count(word, sort = TRUE) %>%
  print(n = 20) %>%   # peek at the top 20 while the pipe runs
  cbind(date)
# Goodness-of-fit test ($\chi^2$) against a uniform distribution.
# p-value < 0.05 = the word counts differ significantly from equal
# probabilities of showing up.
# top <- subset(tidy_pol_fixed2, tidy_pol_fixed2$n >= 200)
# chi <- chisq.test(top$n)
# options(scipen = 999) # removes scientific notation
# chi
# =========== Time to Visualize ===========
# Import CSV as tidy_pol_fixed2
#tidy_pol_fixed2 <- read.csv("~/Documents/Stats/4ChanScraper/csv/Aug 30 2023 18:00:58.csv")
# Experimenting with removing the perennial top words so the rest are visible
tidy_pol_fixed2 <- tidy_pol_fixed2 %>%
  filter(!word %in% c("jew", "nigger", "white", "fuck", "women", "faggot",
                      "world", "posting", "retard", "real", "learn", "party",
                      "country", "america", "life"))
tidy_pol_fixed2 %>%
top_n(50) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n)) +
scale_fill_distiller( palette = "Greens") +
geom_col() +
labs(title = "Most Used Words",
x = "Words",
y = "Count",
fill = "Results") +
coord_flip() +
theme_dark(base_size = 12.5)
tidy_pol_fixed2 %>%
with(wordcloud(word, n, max.words = 150, random.order = FALSE, rot.per = 0.0,
colors = brewer.pal(8, "Dark2")))
# Sentiment analysis, and visuals with a bar graph.
# This will take a few minutes to process.
# On my Ryzen 7 3700X with 16 GB of RAM, it took about 5 minutes.
# sentiment_tidy_pol_fixed_2 <- get_nrc_sentiment(tidy_pol_fixed2$word)
# barplot(colSums(sentiment_tidy_pol_fixed_2),
#         las = 2,
#         col = rainbow(10),
#         ylab = 'Count',
#         main = 'Pol Sentiment Scores')
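# Hedged alternative using libraries already loaded (tidytext's Bing lexicon,
# reshape2::acast, wordcloud::comparison.cloud): a positive/negative
# comparison cloud instead of the NRC barplot. Not part of the original run.
# tidy_pol_fixed2 %>%
#   inner_join(get_sentiments("bing"), by = "word") %>%
#   acast(word ~ sentiment, value.var = "n", fill = 0) %>%
#   comparison.cloud(colors = c("firebrick", "forestgreen"), max.words = 100)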
# Time to save the data
timestamp <- format(Sys.time(), "%b %d %Y %X")
filename <- paste0("~/Documents/Stats/4ChanScraper/", timestamp, ".csv")
write.csv(tidy_pol_fixed2, file = filename)
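# Hedged follow-up: because every run writes a date-stamped CSV with a date
# column, the runs can be stitched into a word-frequency time series later.
# The folder path and the example word are assumptions, not from this script.
# all_runs <- list.files("~/Documents/Stats/4ChanScraper/",
#                        pattern = "\\.csv$", full.names = TRUE) %>%
#   lapply(read.csv) %>%
#   bind_rows()
# all_runs %>%
#   filter(word == "election") %>%   # any word of interest
#   ggplot(aes(as.Date(date), n)) + geom_line()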