Ngram Difference Between Days Script
This commit is contained in:
parent
f0e289f9c7
commit
1adaae90ab
|
|
@ -0,0 +1,97 @@
|
|||
# Load Libraries
|
||||
library("ggplot2")
|
||||
library("tidyverse")
|
||||
library("dplyr")
|
||||
# tidyverse/dplyr provide the %>% pipe used below. On R >= 4.1 the native
# `|>` pipe can be used instead, but dplyr is still required for
# filter(), top_n(), mutate() and case_when().
|
||||
|
||||
|
||||
# Note: Other code is below to do an alternative method,
|
||||
# but the uncommented method is superior.
|
||||
|
||||
# Read the two n-gram snapshots that will be compared.
# Going by the timestamps embedded in the file names, df1 is the earlier
# capture and df2 the later one.
df1 <- read.csv(
  file = "~/Documents/Stats/4ChanScraper/ngram Aug 31 2023 10:49:52.csv"
)
df2 <- read.csv(
  file = "~/Documents/Stats/4ChanScraper/ngram Aug 31 2023 12:08:28.csv"
)
|
||||
|
||||
|
||||
# Full outer join of the two snapshots on `word`, so words that occur in only
# one of them are kept. merge() suffixes the clashing column names:
#   X.x = value from df1
#   X.y = value from df2
df_merged <- merge(x = df1, y = df2, by = "word", all = TRUE)

# Per-word change for the n-gram data: df2 value minus df1 value.
# NOTE(review): a word missing from either snapshot has NA on that side, so
# its result is NA here (not a signed count) — confirm that is intended.
df_merged$result <- df_merged$X.y - df_merged$X.x
|
||||
|
||||
|
||||
# Drop "non-words" / noise tokens before ranking.
# Feel free to add more entries to the vector below as you see fit.
#
# A single %in% test replaces the long chain of `!word == "..."` terms
# (which also listed "niet" twice). The `==` comparisons evaluated to NA for
# a missing word and filter() drops NA rows; %in% never returns NA, so the
# explicit is.na() check keeps that behaviour.
df_difference_filter <- df_merged %>%
  filter(
    !is.na(word),
    !word %in% c(
      "de", "je", "een", "dat", "en", "eu", "te", "tu", "niet", "van",
      "ik", "ze", "om", "met", "uk", "qt", "wat", "bb", "op", "ne", "rh"
    )
  )
|
||||
|
||||
# Words without a computable difference (NA result) are given a difference
# of zero.
df_difference_filter$result <- replace(
  df_difference_filter$result,
  is.na(df_difference_filter$result),
  0
)
|
||||
|
||||
|
||||
# Rank explicitly by `result`. Without `wt`, top_n() silently uses whatever
# column happens to be last in the data frame (and prints a "Selecting by"
# message), which breaks as soon as another column is added after `result`.
# Ties are kept, so either selection may return more than 20 rows.

# 20 smallest differences (a negative n selects from the bottom).
df_bottom <- df_difference_filter %>%
  top_n(-20, wt = result)

# 20 largest differences.
df_top <- df_difference_filter %>%
  top_n(20, wt = result)

# Stack both selections into the data frame that gets plotted.
df_merged2 <- rbind(df_top, df_bottom)
|
||||
|
||||
|
||||
# One fill colour per row of df_merged2, chosen by the size of the difference.
# case_when() uses the first condition that matches, so the thresholds must
# stay in ascending order.
# The final branch is a catch-all: previously a result above 25000 matched no
# condition, case_when() returned NA, and that bar was left without a fill
# colour.
ngram_fill_bar <- case_when(
  df_merged2$result < -22000 ~ "red4",
  df_merged2$result <= -20000 ~ "red1",
  df_merged2$result <= -19000 ~ "darkgreen",
  df_merged2$result <= -18000 ~ "seagreen",
  df_merged2$result <= 0 ~ "black",
  df_merged2$result <= 20000 ~ "slategray4",
  df_merged2$result <= 21000 ~ "slateblue",
  df_merged2$result <= 23000 ~ "steelblue3",
  TRUE ~ "cyan4"
)
|
||||
|
||||
|
||||
# Horizontal bar chart of the per-word difference (df2 minus df1), with the
# words ordered by that difference.
#
# No extra top_n() step here: df_merged2 is already the top/bottom selection,
# and ngram_fill_bar holds one colour per row of df_merged2. Dropping rows at
# this point would leave the fill vector out of step with the plotted data.
df_merged2 %>%
  mutate(word = reorder(word, result)) %>%
  ggplot(aes(word, result)) +
  geom_bar(stat = "identity", fill = ngram_fill_bar) +
  labs(
    title = "Difference of Word Pairs from Today - Yesterday",
    x = "Words",
    y = "Count",
    caption = "Positive integers = More mentions today.\nNegative integers = Less mentions today."
  ) +
  coord_flip() +
  # theme_dark() is a complete theme and replaces every theme element, so it
  # has to come before the theme() tweaks or they are discarded.
  theme_dark(base_size = 13) +
  # NOTE(review): with coord_flip() axis.title.y is the vertical axis title —
  # confirm hiding it is intended given labs(x = "Words") above.
  theme(legend.position = "none", axis.title.y = element_blank())
|
||||
Loading…
Reference in New Issue