# File: /home/slfopp7cb1df/www/inventorypacket.com/vendor/khaled.alshamaa/ar-php/docs/models/ar_norm-v4.R
#' Normalise Arabic text
#'
#' Applies a fixed, ordered sequence of regular-expression substitutions:
#' strips @mentions and #hashtags, unifies Alef and Hamza variants, maps
#' taa marbouta to haa, collapses repeated waw/yaa/alef, blanks out every
#' character that is not a space or one of the whitelisted Arabic letters,
#' drops one-letter words and finally tidies the whitespace.
#'
#' NOTE(review): diacritics and tatweel are not in the whitelist, so they are
#' replaced by spaces, which splits the word they occur in. Confirm that this
#' is intended before relying on it for vocalised text.
#'
#' @param text Character vector to normalise; each element is processed
#'   independently because `gsub()` and `trimws()` are vectorised.
#' @return Character vector of normalised strings.
ar_norm <- function(text) {
  alef_forms <- "أإآى"
  hamza_forms <- "ؤئء"
  arabic_letters <- "ءابتثجحخدذرزسشصضطظعغفقكلمنهوي"

  # Each rule is c(pattern, replacement). The order matters: later rules
  # operate on the output of earlier ones.
  rules <- list(
    c("@\\S+", ""),                              # mentions
    c("#\\S+", ""),                              # hashtags
    c(paste0("[", alef_forms, "]"), "ا"),        # Alef variants
    c(paste0("[", hamza_forms, "]"), "ء"),       # Hamza variants
    c("ة", "ه"),                                 # taa marbouta -> haa
    c("و+", "و"),                                # elongated waw
    c("ي+", "ي"),                                # elongated yaa
    c("ا+", "ا"),                                # elongated alef
    c(paste0("[^ ", arabic_letters, "]+"), " "), # whitelist filter
    c("\\b\\S{1}\\b", " ")                       # one-letter words
  )

  for (rule in rules) {
    text <- gsub(rule[1], rule[2], text)
  }

  # Trim the ends first, then squeeze inner runs of whitespace to one space.
  trimmed <- trimws(text)
  gsub("\\s{2,}", " ", trimmed)
}
# Byte-compiled variant of ar_norm(). cmpfun() is reached through its
# namespace so this line also works when the compiler package is not attached.
cmp_ar_norm <- compiler::cmpfun(ar_norm)
#' Build every in-order two-character combination of a word
#'
#' Splits `word` into single characters and joins each character with every
#' character that comes after it (not only its direct neighbour), so "abc"
#' yields "ab", "ac", "bc".
#'
#' @param word A single character string; only its first element is used.
#' @return Character vector of two-character strings, or `character(0)` when
#'   `word` holds fewer than two characters.
get_pairs <- function(word) {
  chars <- strsplit(word, "")[[1]]

  # combn() stops with "n < m" when asked for pairs out of fewer than two
  # items, so short words are answered directly with an empty result.
  if (length(chars) < 2L) {
    return(character(0))
  }

  # Column k of the matrix holds the k-th pair: row 1 is the first character,
  # row 2 the second.
  stems <- combn(chars, 2L)
  paste0(stems[1, ], stems[2, ])
}
# Byte-compiled variant of get_pairs(). cmpfun() is reached through its
# namespace so this line also works when the compiler package is not attached.
cmp_get_pairs <- compiler::cmpfun(get_pairs)