# ceg_process.R 2.95 KiB
# Silence warnings for the whole run; the script sets warn back to 0 after the
# per-text loop at the end.
options(warn=-1)
suppressPackageStartupMessages(library(tidyverse))

# --- Text metadata ----------------------------------------------------------
# Reads data/CegInfo.txt, splits its `Categori` column on "." into a type code
# (first piece) and a genre code (second piece), attaches Welsh (.cy) and
# English (.en) labels for both codes, and writes the result to
# intermediate/CegInfo.csv. Codes with no entry in the tables below get NA
# labels.

# Type codes and their labels.
type_codes <- c("Rh Ff", "Rh Dd")
type_cy <- set_names(c("Ffeithiol", "Dychmygol"), type_codes)
type_en <- set_names(c("Nonfiction", "Fiction"), type_codes)

# Genre codes, defined once so the Welsh and English tables cannot drift apart.
genre_codes <- c(
  "G Gw", "G A", "G G", "G Ad", "G Ll", "P Ff", "Y", "B", "Gw Ad", "Gw Ll",
  "Gw C", "A", "H", "S", "M", "N", "S B", "P N", "P S", "D", "Ys"
)
genre_cy <- set_names(
  c("Gwasg - Gwyddonol", "Gwasg - Adroddiad", "Gwasg - Golygyddol",
    "Gwasg - Adolygiad", "Gwasg - Llythyrau", "Plant - Ffeithiol",
    "Ysgrythurol", "Bro a Bywyd Gwerin", "Gweinyddol - Adroddiad",
    "Gweinyddol - Llythyrau", "Gweinyddol - Cofnodion / Cytundebau",
    "Academaidd", "Hunangofiant / Cofiant / Dyddiaduron / Atgofion",
    "Sgyrsiau / Pigion", "Medrau a Diddordebau",
    "Nofelau", "Straeon Byrion", "Plant - Nofel",
    "Plant - Straeon", "Dyddiadur Dychmygol", "Ysgrifau"),
  genre_codes
)
genre_en <- set_names(
  c("Press - Scientific", "Press - Report", "Press - Editorial",
    "Press - Review", "Press - Letters", "Children's Nonfiction",
    "Scriptural", "Community Life", "Administrative - Report",
    "Administrative - Letters", "Administrative - Minutes / Contracts",
    "Academic", "Biography / Diaries / Memories", "Discussions / Highlights",
    "Skills and Interests", "Novels", "Short Stories",
    "Children's Novel", "Children's Stories", "Fiction Diaries",
    "Articles / Essays"),
  genre_codes
)

# Return the i-th "."-separated piece of every category string; positions past
# the end of a split yield NA, matching plain `[` indexing.
category_piece <- function(category, i) {
  vapply(str_split(category, "\\."), function(parts) parts[i], character(1))
}

# Only the read is wrapped: suppressMessages() is there to hide readr's
# column-specification message.
text_desc <- suppressMessages(read_tsv("data/CegInfo.txt")) %>%
  mutate(
    type.short = category_piece(Categori, 1),
    # unname() keeps the lookup names out of the resulting columns.
    type.cy = unname(type_cy[type.short]),
    type.en = unname(type_en[type.short]),
    gen.short = category_piece(Categori, 2),
    gen.cy = unname(genre_cy[gen.short]),
    gen.en = unname(genre_en[gen.short])
  )
write_csv(text_desc, "intermediate/CegInfo.csv")
# --- Per-text processing ----------------------------------------------------
# For each i in 1..n_texts, reads intermediate/<i>.pre, keeps the word, pos,
# lemma and mut columns, normalises and filters the mut tag, tidies the
# word/lemma/pos strings, and writes the rows to intermediate/<i>.post as a
# header-less TSV.

n_texts <- 500
progress_every <- 50
# Rows whose `mut` value (after normalisation) is not in this set are dropped.
kept_mut_tags <- c("dim", "meddal", "trwynol", "llaes", "h-llaf")

# Return the i-th piece of every string in `x` after splitting on `pattern`;
# `missing` is used where a string has fewer than i pieces.
split_piece <- function(x, pattern, i, missing = NA_character_) {
  vapply(
    str_split(x, pattern),
    function(parts) if (length(parts) >= i) parts[i] else missing,
    character(1)
  )
}

suppressWarnings(
  for (i in seq_len(n_texts)) {
    # Report progress only; every file is processed regardless.
    if (i %% progress_every == 0) {
      message(paste0("Progress: ", i, "/", n_texts))
    }

    # NOTE(review): read_table2() is deprecated in readr >= 2.0 in favour of
    # read_table(); kept because the two differ on older readr versions.
    text <- suppressMessages(read_table2(paste0("intermediate/", i, ".pre")))

    text <- text %>%
      select(word, pos, lemma, mut) %>%
      # "nm" and missing tags are both recoded to "dim".
      mutate(mut = if_else(is.na(mut) | mut == "nm", "dim", mut)) %>%
      filter(mut %in% kept_mut_tags) %>%
      mutate(
        # Keep only the text before the first "(".
        word = split_piece(word, "\\(", 1),
        # Append whatever follows the first ":" in the lemma (nothing if there
        # is no ":"); must run before `lemma` itself is truncated below.
        pos = paste0(pos, split_piece(lemma, ":", 2, missing = "")),
        lemma = split_piece(lemma, ":", 1)
      )

    write_tsv(text, paste0("intermediate/", i, ".post"), col_names = FALSE)
  }
)
# Restore R's default warning behaviour.
options(warn=0)