X Number of Posts by this ID
This commit is contained in:
parent
82070015e3
commit
f53ad7fbbc
|
|
@ -0,0 +1,169 @@
|
|||
# Load Libraries
|
||||
library("rvest")
|
||||
library("tidyverse")
|
||||
library("ggplot2")
|
||||
library("wordcloud")
|
||||
library("tidytext")
|
||||
library("tinytex")
|
||||
library("syuzhet")
|
||||
library("lubridate")
|
||||
library("scales")
|
||||
library("reshape2")
|
||||
library("dplyr")
|
||||
|
||||
# ---- Scrape internal links from the first 10 /pol/ index pages ----
# Each index page lists the currently active threads; we collect every
# <a href="..."> attribute so thread links can be filtered out below.

#' Scrape all <a href> values from one /pol/ index page.
#'
#' @param page Integer page number (1-10). Page 1 lives at the board
#'   root ("/pol/"); pages 2+ append the page number ("/pol/2", ...).
#' @return Character vector of href attributes (NA where absent).
scrape_pol_page <- function(page) {
  url <- if (page == 1) {
    "https://boards.4channel.org/pol/"
  } else {
    paste0("https://boards.4channel.org/pol/", page)
  }
  read_html(url) %>%
    html_elements("a") %>%
    html_attr("href")
}

# Keep the individual pol_threads1..pol_threads10 variables so the
# combine step below continues to work unchanged.
pol_threads1  <- scrape_pol_page(1)
pol_threads2  <- scrape_pol_page(2)
pol_threads3  <- scrape_pol_page(3)
pol_threads4  <- scrape_pol_page(4)
pol_threads5  <- scrape_pol_page(5)
pol_threads6  <- scrape_pol_page(6)
pol_threads7  <- scrape_pol_page(7)
pol_threads8  <- scrape_pol_page(8)
pol_threads9  <- scrape_pol_page(9)
pol_threads10 <- scrape_pol_page(10)
|
||||
|
||||
# Gather the ten per-page link vectors into one character vector.
# mget() looks the variables up by constructed name, which avoids
# spelling out all ten arguments; use.names = FALSE keeps the result
# unnamed, exactly as c(pol_threads1, ..., pol_threads10) would.
df_pol <- unlist(
  mget(paste0("pol_threads", 1:10)),
  use.names = FALSE
)

# Wrap the scraped links in a one-column tibble so they can be
# filtered with tidyverse verbs below.
pol_table <- tibble(txt = df_pol)
|
||||
|
||||
# Keep only hrefs that look like thread links, e.g.
# "thread/123456789/some-slug": at least six digits, a slash, then a
# lowercase slug. str_detect() is unanchored, so a match anywhere in
# the href qualifies.
df_links <- pol_table %>%
  filter(str_detect(txt, "thread/[0-9]{6,}/[a-z]+"))

# Prepend the board host so every entry is a fully-qualified URL.
# paste0() is the idiomatic replacement for paste(..., sep = "").
df_links$txt <- paste0("https://boards.4chan.org/pol/", df_links$txt)
|
||||
|
||||
# For each thread URL: download the page, select every <span>, and
# pull its "class" attribute. Poster-ID spans carry classes like
# "id_XXXXXXXX", which are mined for IDs further down.
fetch_span_classes <- function(url) {
  read_html(url) %>%
    html_elements("span") %>%
    html_attr("class")
}

# One character vector of class strings per thread.
threads <- lapply(df_links$txt, fetch_span_classes)
|
||||
|
||||
# Flatten every thread's vector of <span> class strings into a single
# one-column data frame for tokenizing.
#
# Fix: the original did do.call(rbind.data.frame, threads) followed by
# a tibble/transpose round-trip. rbind.data.frame() errors (or silently
# recycles) when threads have differing numbers of spans, and takes its
# column names from the first thread's values. unlist() flattens the
# list directly with neither problem, and nothing downstream used the
# intermediates (threads_rbind, threads_col) — only threads_col_all.
threads_col_all <- data.frame(
  txt = unlist(threads, use.names = FALSE),
  stringsAsFactors = FALSE
)
|
||||
|
||||
# Break each class string into single-word tokens, then keep only
# tokens that look like poster IDs, e.g. "id_a1b2c3d4".
#
# Regex fix: the original character class "[a-zA-z0-9]" used the range
# "A-z", which also matches the punctuation between 'Z' and 'a'
# ("[", "\", "]", "^", "_", "`"); "A-Z" is what was intended.
# (unnest_tokens lowercases by default, so in practice only lowercase
# IDs appear, but the class is now correct either way.)
tidy_pol_IDs <- threads_col_all %>%
  unnest_tokens(word, txt, format = "text") %>%
  filter(str_detect(word, "id_[a-zA-Z0-9]{8}"))
|
||||
|
||||
|
||||
# === Import CSV as tidy_pol_IDs ===
|
||||
# tidy_pol_IDs <- read.csv("~/Documents/Stats/4ChanScraper/Aug 28 2023 06:01:27 PM.csv")
|
||||
|
||||
# Tally how many posts each unique ID made across all scraped threads,
# sorted most-active first. (head() this to inspect the top posters;
# the full table is kept for the plots below.)
tidy_pol_sorted_IDs <- count(tidy_pol_IDs, word, sort = TRUE)
|
||||
|
||||
# Exploratory only: the 50 least-active IDs. Most of the long tail
# posts just once or twice across all threads.
tidy_pol_bottom_10_IDs <- tail(count(tidy_pol_IDs, word, sort = TRUE), 50)
|
||||
|
||||
# Counts all occurrences of each unique ID showing up in threads.
#
# Bug fix: a stray tail(50) — copy-pasted from the "bottom IDs" block
# above — truncated this to the 50 least-active IDs, contradicting the
# stated intent. The full per-ID count is kept here. (count_by_IDs is
# not consumed by any later step, so this change is self-contained.)
count_by_IDs <- tidy_pol_IDs %>%
  count(word, sort = TRUE)
|
||||
|
||||
# Collapse the per-ID counts into a frequency-of-frequencies table:
# for each post count n, nn is how many unique IDs posted exactly n
# times (e.g. "250 IDs posted twice, for 500 posts across threads").
count_by_n <- count(tidy_pol_sorted_IDs, n, sort = TRUE)
|
||||
|
||||
|
||||
# =========== Time to Visualize ===========

# Horizontal bar chart of the 30 most active poster IDs across all
# scraped threads, ordered by post count.
# Label fix: the discrete axis shows poster IDs, not "Words".
tidy_pol_sorted_IDs %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "darkblue") +
  xlab("Poster ID") +
  ylab("Number of Posts") +
  coord_flip() +
  theme_dark()
|
||||
|
||||
# Frequency-of-frequencies plot: for each "posts per ID" value n, how
# many IDs (nn) were observed with exactly that many posts, for the
# top 20 most common values of n.
#
# Bug fix: the original mutate(reorder(n, nn)) created an unnamed
# column and never changed the plotted order, so the "orders the top
# 20 frequencies" intent was dead code. Assigning the reordered factor
# back to n makes the bars actually sort by observed count.
count_by_n %>%
  top_n(20) %>%
  mutate(n = reorder(n, nn)) %>%
  ggplot(aes(n, nn)) +
  geom_col(fill = "darkblue") +
  xlab("Number of Posts per ID") +
  ylab("Observed Counts of X posts") +
  coord_flip() +
  theme_dark()
|
||||
|
||||
# ---- Save the scraped IDs ----
# Timestamp fix: the original "%b %d %Y %X" format embeds colons and
# spaces ("Aug 28 2023 06:01:27 PM"), which are illegal in Windows
# filenames and awkward everywhere; a sortable, filesystem-safe format
# is used instead. The variable is also renamed so it no longer
# shadows utils::timestamp(). row.names = FALSE drops the meaningless
# auto-numbered index column from the CSV.
run_stamp <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
filename <- paste0("~/Documents/Stats/4ChanScraper/Pol-IDs-", run_stamp, ".csv")
write.csv(tidy_pol_IDs, file = filename, row.names = FALSE)
|
||||
Loading…
Reference in New Issue