I have a dataframe, an example in .csv format is shown below, that has a list of words (Word), the number of sounds in those words (NumSounds), and the transcription of the sounds in each word (Pronunciation). I have been trying to create a file that shows me what the minimal pairs are for each word in the list. This means, for every word, I need to know which other words in the list have an edit distance of 1 from that word while also having the same number of sounds. I have been doing this in R. The loop that my lab-mate wrote is shown below:
library(stringdist)
# Build `myminimalpairs`: for every row of `df`, list the other words that are
# minimal pairs of it, i.e. words with the same NumSounds whose Pronunciation
# is at Levenshtein distance 1 from this row's Pronunciation.
#
# Expects `df` to have the columns Word, NumSounds and Pronunciation.
# Produces a data frame with one row per row of `df`:
#   word  - the spelling taken from df$Word
#   pairs - the matching spellings joined with ", " ("" when there are none)

n_words <- nrow(df)
# as.character() guards against the columns having been read in as factors.
pronunciations <- as.character(df$Pronunciation)
spellings <- as.character(df$Word)

words <- spellings
pairs <- vector(mode = "character", length = n_words)
# txtProgressBar() requires max > min, so keep max at least 1 for an empty df.
pb <- txtProgressBar(min = 0, max = max(n_words, 1L), style = 3)
for (i in seq_len(n_words)) {
  word <- pronunciations[i]
  nphones <- df$NumSounds[i]
  # Candidates: same number of sounds but a different pronunciation. Comparing
  # pronunciations (not spellings) drops the word itself and its homophones.
  # which() discards NA comparisons instead of yielding NA indices.
  candidates <- which(pronunciations != word & df$NumSounds == nphones)
  distances <- stringdist(word, pronunciations[candidates], method = "lv")
  # Take the spellings straight from the matching rows; no lookup back from
  # pronunciation to word is needed.
  minimal_pairs <- unique(spellings[candidates[which(distances == 1)]])
  # collapse= joins all matches into a single string for this row.
  pairs[i] <- paste(minimal_pairs, collapse = ", ")
  setTxtProgressBar(pb, i)
}
close(pb)
myminimalpairs <- data.frame(word = words, pairs = pairs)
head(myminimalpairs, 10)
Word,NumSounds,Pronunciation
abbey,3,&bi
abide,4,^b#d
abort,5,^b>rt
abroad,5,^br>d
abrupt,6,^br^pt
absence,6,&bs^ns
absent,6,&bs^nt
absorb,6,^bz>rb
absorbed,7,^bz>rbd
abstract,8,&bstr&kt
abused,6,^byuzd
abyss,4,^bIs
accents,7,&ksEnts
accepts,7,&ksEpts
accessed,6,&ksEst
accord,5,^k>rd
accuse,5,^kyuz
achieve,4,^Civ
achieved,5,^Civd
aching,4,ekIN