# Awk script that calculates the Viterbi optimal path for a 2 states (A,B) model with 2 possible emissions (X,Y)
# The probabilities are set as the CpG Island example in the course Genomica Computazionale. Modify them to run on different models 
#Example1: printf "Y\nY\nY\nX\nX\nY\n" | awk -f viterbi.awk
#Example2: cat FILE | awk -f viterbi.awk #where FILE contains a X and a Y for each row
BEGIN {
    # ----- STATES -----
    states[1] = "A"
    states[2] = "B"

    # ----- INITIAL PROBABILITIES π -----
    pi["A"] = 0.5
    pi["B"] = 0.5

    # ----- TRANSITIONS a(i→j) -----
    a["A","A"] = 0.9;  a["A","B"] = 0.1
    a["B","A"] = 0.5;  a["B","B"] = 0.5

    # ----- EMISSIONS e(state, symbol) -----
    e["A","X"] = 0.02; e["A","Y"] = 0.98
    e["B","X"] = 0.8;  e["B","Y"] = 0.2
}

# Read one symbol per line
{
    T++
    obs[T] = $1
}

END {
    if (T == 0) {
        print "No observations!"
        exit 1
    }

    # ----- INITIALIZATION -----
    for (s in states) {
        st = states[s]
        dp[1,st]   = pi[st] * e[st, obs[1]]
        back[1,st] = "START"
    }

    # ----- RECURSION -----
    for (t = 2; t <= T; t++) {
        o = obs[t]
        for (s in states) {
            curr = states[s]
            bestProb = -1
            bestPrev = ""

            for (sp in states) {
                prev = states[sp]
                cand = dp[t-1,prev] * a[prev, curr]
                if (cand > bestProb) {
                    bestProb = cand
                    bestPrev = prev
                }
            }
            dp[t,curr]   = bestProb * e[curr,o]
            back[t,curr] = bestPrev
        }
    }

    # ----- TERMINATION -----
    finalBest = -1
    finalState = ""
    for (s in states) {
        st = states[s]
        if (dp[T,st] > finalBest) {
            finalBest = dp[T,st]
            finalState = st
        }
    }

    # ----- BACKTRACK -----
    path[T] = finalState
    for (t = T-1; t >= 1; t--) {
        path[t] = back[t+1, path[t+1]]
    }

    # ----- OUTPUT -----
    for (t = 1; t <= T; t++)
        printf "%s\t%s\n", obs[t], path[t]
}