;; Optimized multiply for 4x4 float matrices only.
;; TODO: also provide integer and double variants.
;; mmmul: 4x4 single-precision matrix product built on 4-wide float vectors.
;;
;; m1, m2 and result each address 16 floats, handled as four consecutive
;; groups of four. For every group index i (0..3) the output group is
;;   m1[4i+0]*m2[0..3] + m1[4i+1]*m2[4..7]
;;     + m1[4i+2]*m2[8..11] + m1[4i+3]*m2[12..15]
;; i.e. result = m1 x m2 when each group of four is read as a matrix row.
;; Returns the `result` pointer that was passed in.
;;
;; NOTE(review): the float* arguments are reinterpreted as /4,float/* --
;; presumably they must be suitably aligned for vector access; confirm.
;; NOTE(review): m2 is accessed through pointers on every iteration, so
;; `result` probably must not overlap m2; confirm against callers.
(bind-func mmmul
  (lambda (m1:float* m2:float* result:float*)
    (let ((dst:/4,float/* (cast result))
          ;; the four 4-float groups of m2, viewed as vectors
          (m2r0:/4,float/* (cast (pref-ptr m2 0)))
          (m2r1:/4,float/* (cast (pref-ptr m2 4)))
          (m2r2:/4,float/* (cast (pref-ptr m2 8)))
          (m2r3:/4,float/* (cast (pref-ptr m2 12)))
          ;; current 4-float group of m1, re-pointed on each pass
          (lhs:/4,float/* null)
          ;; loop counter is bound here for dotimes
          (i 0))
      (dotimes (i 4)
        (set! lhs (cast (pref-ptr m1 (* 4 i))))
        ;; splat each of the four lanes of lhs across a whole vector
        (let ((s0 (vshuffle lhs null 0 0 0 0))
              (s1 (vshuffle lhs null 1 1 1 1))
              (s2 (vshuffle lhs null 2 2 2 2))
              (s3 (vshuffle lhs null 3 3 3 3)))
          ;; pairwise sum; grouping kept as (t0 + t1) + (t2 + t3)
          (pset! dst i
                 (+ (+ (* s0 m2r0)
                       (* s1 m2r1))
                    (+ (* s2 m2r2)
                       (* s3 m2r3))))))
      result)))