# ceg_process.R
# Authored by Jonathan Dallas Jones (commit b2205618).
# Warnings are silenced for the whole run; the last line of the script sets
# `warn` back to 0.
options(warn=-1)
suppressPackageStartupMessages(library(tidyverse))

# ---- Text metadata ---------------------------------------------------------
# Reads data/CegInfo.txt, splits each `Categori` value on "." and adds six
# columns: the first piece (`type.short`) with its Welsh/English label, and
# the second piece (`gen.short`) with its Welsh/English label. The result is
# written to intermediate/CegInfo.csv.

# Type codes and their labels. Codes are listed once so the Welsh and English
# tables cannot drift apart.
type_codes <- c("Rh Ff", "Rh Dd")
type_cy <- set_names(c("Ffeithiol", "Dychmygol"), type_codes)
type_en <- set_names(c("Nonfiction", "Fiction"), type_codes)

# Genre codes and their labels, position-aligned with `genre_codes`.
genre_codes <- c(
  "G Gw", "G A", "G G", "G Ad", "G Ll", "P Ff", "Y", "B", "Gw Ad", "Gw Ll",
  "Gw C", "A", "H", "S", "M", "N", "S B", "P N", "P S", "D", "Ys"
)
genre_cy <- set_names(
  c(
    "Gwasg - Gwyddonol", "Gwasg - Adroddiad", "Gwasg - Golygyddol",
    "Gwasg - Adolygiad", "Gwasg - Llythyrau", "Plant - Ffeithiol",
    "Ysgrythurol", "Bro a Bywyd Gwerin", "Gweinyddol - Adroddiad",
    "Gweinyddol - Llythyrau", "Gweinyddol - Cofnodion / Cytundebau",
    "Academaidd", "Hunangofiant / Cofiant / Dyddiaduron / Atgofion",
    "Sgyrsiau / Pigion", "Medrau a Diddordebau",
    "Nofelau", "Straeon Byrion", "Plant - Nofel",
    "Plant - Straeon", "Dyddiadur Dychmygol", "Ysgrifau"
  ),
  genre_codes
)
genre_en <- set_names(
  c(
    "Press - Scientific", "Press - Report", "Press - Editorial",
    "Press - Review", "Press - Letters", "Children's Nonfiction",
    "Scriptural", "Community Life", "Administrative - Report",
    "Administrative - Letters", "Administrative - Minutes / Contracts",
    "Academic", "Biography / Diaries / Memories", "Discussions / Highlights",
    "Skills and Interests", "Novels", "Short Stories",
    "Children's Novel", "Children's Stories", "Fiction Diaries",
    "Articles / Essays"
  ),
  genre_codes
)

# Only the file read emits messages (readr's column specification), so that
# is the only call wrapped in suppressMessages().
ceg_info <- suppressMessages(read_tsv("data/CegInfo.txt"))

# Split every category once, for the whole column, instead of once per row.
category_parts <- str_split(ceg_info[["Categori"]], "\\.")

text_desc <- ceg_info %>%
  mutate(
    # A missing piece, or a code absent from the lookup tables, yields NA.
    type.short = map_chr(category_parts, 1, .default = NA_character_),
    type.cy = unname(type_cy[type.short]),
    type.en = unname(type_en[type.short]),
    gen.short = map_chr(category_parts, 2, .default = NA_character_),
    gen.cy = unname(genre_cy[gen.short]),
    gen.en = unname(genre_en[gen.short])
  )

write_csv(text_desc, "intermediate/CegInfo.csv")
# ---- Per-text token tables -------------------------------------------------
# For each text i in 1..n_texts, reads intermediate/<i>.pre, keeps the
# columns word/pos/lemma/mut, normalises `mut`, drops rows whose `mut` is not
# one of `kept_mutations`, tidies `word`, `lemma` and `pos`, and writes the
# result without a header row to intermediate/<i>.post.
n_texts <- 500L
progress_step <- 50L
kept_mutations <- c("dim", "meddal", "trwynol", "llaes", "h-llaf")

suppressWarnings(
  for (i in seq_len(n_texts)) {
    if (i %% progress_step == 0L) {
      message(paste0("Progress: ", i, "/", n_texts))
    }

    text <- suppressMessages(read_table2(paste0("intermediate/", i, ".pre")))

    text <- text %>%
      select(all_of(c("word", "pos", "lemma", "mut"))) %>%
      mutate(
        # readr guesses `logical` for a column that is entirely missing;
        # coerce so the recode below always works on character values.
        mut = as.character(mut),
        # Both "nm" and missing values are recoded to "dim".
        mut = if_else(is.na(mut) | mut == "nm", "dim", mut)
      ) %>%
      filter(mut %in% kept_mutations) %>%
      mutate(
        # Keep only the text before the first "(".
        word = map_chr(str_split(word, "\\("), 1),
        # `lemma` may carry a ":"-separated suffix: the first piece stays in
        # `lemma`, the second piece (if any) is appended to `pos`.
        lemma_parts = str_split(lemma, ":"),
        lemma = map_chr(lemma_parts, 1),
        pos = paste0(pos, map_chr(lemma_parts, 2, .default = ""))
      ) %>%
      select(-lemma_parts)

    write_tsv(text, paste0("intermediate/", i, ".post"), col_names = FALSE)
  }
)

# Re-enable default warning behaviour (silenced at the top of the script).
options(warn=0)