# IPA symbols Unicode codes ----

# Load the `ipa_symbols` dataset into the current environment; it is read
# again below when the vowel rows are filtered out of it.
data("ipa_symbols", envir = environment())

## Official IPA ----

# Code points of the official IPA symbols, written as four-digit hexadecimal
# strings without the "U+" prefix. They are converted to characters further
# down when `ipa_chars` is built.
# Note: "1D31" and "1D91" are listed here and also fall inside the contiguous
# ranges held in `ipa_extensions` and `ipa_supplements`.
ipa_symbols_unicode <- c(
  "0061", "00E6", "0250", "0251", "0252", "0062", "0299", "0253", "0063",
  "00E7", "0255", "0064", "00F0", "0256", "0257", "0065", "0259", "025B",
  "0258", "025C", "025E", "0066", "0261", "0262", "0260", "029B", "0264",
  "0263", "0068", "0127", "029C", "0266", "0267", "0265", "0069", "026A",
  "0268", "006A", "029D", "025F", "0284", "006B", "006C", "029F", "026C",
  "026D", "026E", "028E", "006D", "0271", "006E", "0274", "0272", "0273",
  "014B", "006F", "00F8", "0153", "0276", "0254", "0275", "0070", "0278",
  "0071", "0072", "0280", "0279", "027A", "027B", "027D", "027E", "0281",
  "0073", "0282", "0283", "0074", "0288", "0075", "0289", "026F", "0270",
  "028A", "0076", "028B", "2C71", "028C", "0077", "028D", "0078", "0079",
  "028F", "007A", "0290", "0291", "0292", "0294", "0295", "02A1", "02A2",
  "01C0", "01C1", "01C2", "01C3", "0298", "03B2", "03B8", "03C7", "0334",
  "033C", "032A", "033B", "033A", "031F", "0320", "031D", "031E", "0318",
  "0319", "031C", "0339", "032C", "0325", "0330", "0324", "0329", "032F",
  "0303", "0308", "033D", "0306", "031A", "02DE", "02E1", "207F", "02B7",
  "02B2", "02E0", "02E4", "02B0", "02BC", "02D0", "02D1", "0361", "02C8",
  "02CC", "02E5", "02E6", "02E7", "02E8", "02E9", "A71B", "A71C", "2191",
  "2193", "2197", "2198", "0020", "002E", "007C", "2016", "203F", "030A",
  "0067", "030B", "0301", "0304", "0300", "030F", "0302", "030C", "1DC4",
  "1DC5", "1DC6", "1DC7", "1DC8", "1DC9", "035C", "203C", "1D91", "0348",
  "0349", "0353", "032E", "0347", "02C0", "02B1", "1D31", "0327"
)

## IPA extensions ----

# Every code point from U+1D00 through U+1D7F (128 values), as upper-case
# four-digit hexadecimal strings. The range is contiguous, so it is generated
# rather than typed out by hand.
ipa_extensions <- sprintf("%04X", 0x1D00:0x1D7F)

## IPA extensions supplement ----

# Every code point from U+1D80 through U+1DBF (64 values), as upper-case
# four-digit hexadecimal strings. The range is contiguous, so it is generated
# rather than typed out by hand.
ipa_supplements <- sprintf("%04X", 0x1D80:0x1DBF)

# All IPA characters ----

# All IPA characters, one string per code point.
#
# The three code-point vectors overlap: "1D31" and "1D91" are listed in
# `ipa_symbols_unicode` and also fall inside the contiguous extension ranges.
# `unique()` drops the repeats (keeping first-occurrence order) so that no
# character appears twice in the inventory.
ipa_chars <- intToUtf8(
  Unicode::as.u_char(
    unique(c(ipa_symbols_unicode, ipa_extensions, ipa_supplements))
  ),
  multiple = TRUE
)

# IPA diacritics ----

## Official diacritics ----

# Code points of the official IPA diacritics and related marks, as four-digit
# hexadecimal strings without the "U+" prefix. Every entry also occurs in
# `ipa_symbols_unicode`; this vector is converted to characters in
# `ipa_diacritics`.
ipa_diacritics_unicode <- c(
  "0334", "033C", "032A", "033B", "033A", "031F", "0320", "031D", "031E",
  "0318", "0319", "031C", "0339", "032C", "0325", "0330", "0324", "0329",
  "032F", "0303", "0308", "033D", "0306", "031A", "02DE", "02E1", "207F",
  "02B7", "02B2", "02E0", "02E4", "02B0", "02BC", "02D0", "02D1", "0361",
  "02C8", "02CC", "02E5", "02E6", "02E7", "02E8", "02E9", "A71B", "A71C",
  "2191", "2193", "2197", "2198", "203F", "030A", "030B", "0301", "0304",
  "0300", "030F", "0302", "030C", "1DC4", "1DC5", "1DC6", "1DC7", "1DC8",
  "1DC9", "035C", "0348", "0349", "0353", "032E", "0347", "02C0", "02B1",
  "1D31", "0327"
)

# The diacritic code points above rendered as characters: one string per
# code point, in the same order as `ipa_diacritics_unicode`.
ipa_diacritics <- intToUtf8(
  x = Unicode::as.u_char(ipa_diacritics_unicode), multiple = TRUE
)

# Code points (hexadecimal, no "U+" prefix) that feed `diacritics_regex` and
# `rm_diacritics_regex`. This is a subset of `ipa_diacritics_unicode`: entries
# such as "02C8", "02CC", "02E5"-"02E9", the arrows and "0327" are not
# included here.
diacritics <- c(
  "0334", "033C", "032A", "033B", "033A", "031F", "0320", "031D", "031E",
  "0318", "0319", "031C", "0339", "032C", "0325", "0330", "0324", "0329",
  "032F", "0303", "0308", "033D", "0306", "031A", "02DE", "02E1", "207F",
  "02B7", "02B2", "02E0", "02E4", "02B0", "02BC", "02D0", "02D1", "0361",
  "030A", "035C", "0348", "0349", "0353", "032E", "0347", "02C0", "02B1",
  "1D31"
)

# Pattern for any single character followed by one or more of the marks in
# `diacritics`. Each mark is written as a `\uXXXX` regex escape and the
# escapes are joined into one alternation group.
diacritics_regex <- sprintf(
  ".(%s)+",
  stringr::str_flatten(paste0("\\u", diacritics), collapse = "|")
)

# Pattern for a run of one or more marks from `diacritics` on their own,
# i.e. without the leading base character that `diacritics_regex` includes.
rm_diacritics_regex <- sprintf(
  "(%s)+",
  stringr::str_flatten(paste0("\\u", diacritics), collapse = "|")
)

## Tone diacritics and letters ----

# Tone marks stored as literal characters (not hex strings like the vectors
# above). Element order is significant only in that it is kept unchanged.
tones <- c(
  # Modifier tone letters (U+02E5-U+02E9) and raised up/down arrows
  "\u02E5", "\u02E6", "\u02E7", "\u02E8", "\u02E9",
  "\uA71B", "\uA71C",
  # Arrows
  "\u2191", "\u2193", "\u2197", "\u2198",
  # NOTE(review): U+203F (undertie) and U+030A (combining ring above) are not
  # tone marks in Unicode terms, and U+030A is also listed in `diacritics`;
  # confirm they are meant to be here.
  "\u203F", "\u030A",
  # Combining accents and contour marks
  "\u030B", "\u0301", "\u0304", "\u0300", "\u030F", "\u0302", "\u030C",
  "\u1DC4", "\u1DC5", "\u1DC6", "\u1DC7", "\u1DC8", "\u1DC9"
)

## Prenasalised diacritics ----

# Code points (hexadecimal, no "U+" prefix) used to build `prenasal_regex`.
ipa_prenasal_unicode <- c(
  "1D50", "1D51",
  "1DAC", "1DAE", "1DAF", "1DB0",
  "207F"
)

# Pattern for one of the code points in `ipa_prenasal_unicode` followed by
# any single character. Each code point is written as a `\uXXXX` regex
# escape inside one alternation group.
prenasal_regex <- sprintf(
  "(%s).",
  stringr::str_flatten(paste0("\\u", ipa_prenasal_unicode), collapse = "|")
)

# Affricates ----

# Two-character sequences treated as affricates. Non-ASCII letters are
# written as `\u` escapes.
affricates <- c(
  "pf", "bv",
  "ts", "dz",
  "t\u0283", "d\u0292",
  "t\u0255", "t\u0291",
  "c\u00E7", "\u025F\u029D",
  "kx", "\u0261\u0263",
  "q\u03C7", "\u0262\u0281"
)

# Alternation pattern matching any one of the sequences in `affricates`.
affricates_regex <- stringr::str_flatten(
  string = affricates,
  collapse = "|"
)

# Vowel sequences ----

# Rows of `ipa_symbols` whose `type` is "vowel".
vowels <- dplyr::filter(.data = ipa_symbols, type == "vowel")

# Pattern for two or more consecutive vowels: the `IPA` values of the rows
# above are joined into one alternation group and quantified with `{2,}`.
vowels_regex <- paste(
  c("(", stringr::str_flatten(vowels$IPA, collapse = "|"), "){2,}"),
  collapse = ""
)
