##### Load Package #####
library(edgar)
library(stringr)
library(lubridate)
library(edgarWebR)
library(dplyr)
library(tidyr)
library(textshape)
library(readr)
##### Download 10-K from EDGAR #####
## set directory
setwd("/Users/yeqiaozhi/Dropbox/project/data/") #enter your own directory here
get_10_k <- getFilingsHTML(cik.no = "ALL", #enter companys' cik or use "ALL"
form.type = "10-K",
filing.year = 2022, #enter filing year, e.g., 2022
quarter = c(1, 2, 3, 4),
useragent = "xxx@xxx.com" #enter your email address
)
##### Extract filter status and public float #####
## create an empty dataset to save results
parse_df <- data.frame()
## extract all cik inside the folder that stores the 10-K HTML files.
cik_list <- list.files("/Users/yeqiaozhi/Dropbox/project/data/Edgar filings_HTML view/Form 10-K/")
## extract filer status and public float using iteration
for (i in 1:length(cik_list)){
## get all 10-K file name under a cik folder
file_name <- list.files(paste0("/Users/yeqiaozhi/Dropbox/project/data/Edgar filings_HTML view/Form 10-K/",cik_list[i],"/"))
for (w in 1:length(file_name)){
## get the full 10-K file path
file <- paste0("/Users/yeqiaozhi/Dropbox/project/data/Edgar filings_HTML view/Form 10-K/",cik_list[i],"/",file_name[w]) #replace "/Users/yeqiaozhi/Dropbox/project/data/" with your own location.
## get filing date
file_date <- as.Date(str_remove_all(str_extract(file,"10-K_.{10}"),"10-K_"),"%Y-%m-%d")
## read and get the first 10,000 characters from the 10-K file. Filer status and public float are usually shown in the front page so the first 10,000 characters should be able to capture the information.
script <- tolower(substr(parse_filing(file, strip = TRUE, include.raw = FALSE, fix.errors = TRUE)[1],1,10000))
## remove all whitespace
script_nospace <- gsub(x = gsub("\\s+", "", script), pattern = intToUtf8(160),replacement = "")
## extract prior 35 characters before the symbols indicating "yes". Filer status usually presents before one of them.
symbol_1 <- data.frame(str_extract_all(script_nospace,".{30}☒")) %>% rename(words=1)
symbol_2 <- data.frame(str_extract_all(script_nospace,".{30}[X]")) %>% rename(words=1)
symbol_3 <- data.frame(str_extract_all(script_nospace,".{30}[x]")) %>% rename(words=1)
symbol_4 <- data.frame(str_extract_all(script_nospace,".{30}☑")) %>% rename(words=1)
symbol_5 <- data.frame(str_extract_all(script_nospace,".{30}⌧")) %>% rename(words=1)
symbol_6 <- data.frame(str_extract_all(script_nospace,".{30}þ")) %>% rename(words=1)
symbol_7 <- data.frame(str_extract_all(script_nospace,".{30}ý")) %>% rename(words=1)
## combine all extracted words and find the filer status
filer_status <- bind_rows(symbol_1,symbol_2,symbol_3,symbol_4,symbol_5,symbol_6,symbol_7) %>%
mutate(words=str_remove_all(words," "),
status=case_when(str_detect(words,"reporting") & str_detect(words,"company") ~"src",
str_detect(words,"large") & str_detect(words,"accelerated")~"laf",
str_detect(words,"accelerated") & !str_detect(words,"large") & !str_detect(words,"non-")~"af",
str_detect(words,"non-accelerated") ~"naf",
str_detect(words,"growth") & str_detect(words,"company")~"egc")) %>%
filter(!is.na(status)) %>%
summarise(status=paste(status,collapse = ",")) %>%
mutate(src=ifelse(str_detect(status,"src"),TRUE,FALSE),
naf=ifelse(str_detect(status,"naf"),TRUE,FALSE),
af=ifelse(str_detect(status,"af") & !str_detect(status,"laf") & !str_detect(status,"naf"),TRUE,FALSE),
laf=ifelse(str_detect(status,"laf"),TRUE,FALSE),
egc=ifelse(str_detect(status,"egc"),TRUE,FALSE),
status=ifelse(status=="",NA,status))
## split the script into sentences
sentence <- data.frame(split_sentence(script)) %>% rename(sentence=1)
## extract the sentence containing public float
public_float <- sentence %>%
filter(str_detect(sentence,"affiliates"),
str_detect(sentence,"(?i)value"),
str_detect(sentence,"(?i)\\$"))
#if public float is non-empty
if (nrow(public_float)>0){
public_float <- public_float %>%
group_by(row_number()) %>% #iterate in each sentence
mutate(# remove share price from the sentences
sentence=ifelse(!is.na(str_extract(sentence,"\\$[^\\$]+par value|\\$[^\\$]+per share")) &
nchar(str_extract(sentence,"\\$[^\\$]+par value|\\$[^\\$]+per share"))<20,
str_remove_all(sentence,"\\$[^\\$]+par value|\\$[^\\$]+per share"),sentence),
# extract the float and convert it to dollar value
publicfloat=case_when(!is.na(str_extract(sentence,"\\$[^\\$]+thousand")) & str_detect(str_extract(sentence,"\\$[^\\$]+thousand"),"\\d.") ~
parse_number(str_extract(sentence,"\\$[^\\$]+thousand"))*1000,
!is.na(str_extract(sentence,"\\$[^\\$]+million")) & str_detect(str_extract(sentence,"\\$[^\\$]+million"),"\\d.") ~
parse_number(str_extract(sentence,"\\$[^\\$]+million"),"|")*1000000,
!is.na(str_extract(sentence,"\\$[^\\$]+billion")) & str_detect(str_extract(sentence,"\\$[^\\$]+billion"),"\\d.") ~
parse_number(str_extract(sentence,"\\$[^\\$]+billion"))*1000000000,
!is.na(str_extract(sentence,"\\$[^\\$]+trillion")) & str_detect(str_extract(sentence,"\\$[^\\$]+trillion"),"\\d.") ~
parse_number(str_extract(sentence,"\\$[^\\$]+trillion"))*1000000000000,
!is.na(str_extract_all(str_remove_all(sentence," "),"\\$([0-9,.]+)")) &
!is.na(match(0,lapply(str_extract_all(str_remove_all(sentence," "),"\\$([0-9,.]+)"),parse_number)[[1]])) ~
0,
!is.na(str_extract_all(str_remove_all(sentence," "),"\\$([0-9,.]+)")) &
is.na(match(0,lapply(str_extract_all(str_remove_all(sentence," "),"\\$([0-9,.]+)"),parse_number)[[1]])) ~
max(lapply(str_extract_all(str_remove_all(sentence," "),"\\$([0-9,.]+)"),parse_number)[[1]])),
publicfloat=ifelse(is.infinite(publicfloat),NA,publicfloat)) %>%
ungroup() %>% select(-`row_number()`) %>%
filter(sum(!is.na(publicfloat))==0 | !is.na(publicfloat))
## save data into parse_df and add cik, file path, and file date
i_parse_df <- merge(public_float, filer_status)%>%
mutate(cik=cik_list[i], filepath=file, filedate=file_date)
parse_df <- bind_rows(parse_df,i_parse_df)
}
else{#if public float is empty
parse_df <- bind_rows(parse_df,filer_status%>% mutate(cik=cik_list[i], filepath=file, filedate=file_date))
}
## print out the cik and its index in cik_list, to inform that you complete the data collection for this filer
print(paste("cik:",cik_list[i],"; index: ",i))
}
}
## save results
write_csv(parse_df,"filer_status_public_float.csv")