# Awk script that calculates the Viterbi optimal path for a 2 states (A,B) model with 2 possible emissions (X,Y) # The probabilities are set as the CpG Island example in the course Genomica Computazionale. Modify them to run on different models #Example1: printf "Y\nY\nY\nX\nX\nY\n" | awk -f viterbi.awk #Example2: cat FILE | awk -f viterbi.awk #where FILE contains a X and a Y for each row BEGIN { # ----- STATES ----- states[1] = "A" states[2] = "B" # ----- INITIAL PROBABILITIES π ----- pi["A"] = 0.5 pi["B"] = 0.5 # ----- TRANSITIONS a(i→j) ----- a["A","A"] = 0.9; a["A","B"] = 0.1 a["B","A"] = 0.5; a["B","B"] = 0.5 # ----- EMISSIONS e(state, symbol) ----- e["A","X"] = 0.02; e["A","Y"] = 0.98 e["B","X"] = 0.8; e["B","Y"] = 0.2 } # Read one symbol per line { T++ obs[T] = $1 } END { if (T == 0) { print "No observations!" exit 1 } # ----- INITIALIZATION ----- for (s in states) { st = states[s] dp[1,st] = pi[st] * e[st, obs[1]] back[1,st] = "START" } # ----- RECURSION ----- for (t = 2; t <= T; t++) { o = obs[t] for (s in states) { curr = states[s] bestProb = -1 bestPrev = "" for (sp in states) { prev = states[sp] cand = dp[t-1,prev] * a[prev, curr] if (cand > bestProb) { bestProb = cand bestPrev = prev } } dp[t,curr] = bestProb * e[curr,o] back[t,curr] = bestPrev } } # ----- TERMINATION ----- finalBest = -1 finalState = "" for (s in states) { st = states[s] if (dp[T,st] > finalBest) { finalBest = dp[T,st] finalState = st } } # ----- BACKTRACK ----- path[T] = finalState for (t = T-1; t >= 1; t--) { path[t] = back[t+1, path[t+1]] } # ----- OUTPUT ----- for (t = 1; t <= T; t++) printf "%s\t%s\n", obs[t], path[t] }