#include <stdio.h>
#include <errno.h>
#include <string.h>

#include <ppm.h>

#include "blur.h"


/*
 * Blur in_image into out_image with a 3x3 weighted kernel
 * (1-2-1 / 2-4-2 / 1-2-1, normalised by 16), using AMD 3DNow!
 * instructions through GCC-style inline assembly.
 *
 *   out_image  destination rows, indexed [row][col]; written for every
 *              (row, col) with row < n_rows and col < n_cols
 *   in_image   source rows, indexed [row][col]; only read
 *   n_cols     image width in pixels
 *   n_rows     image height in pixels
 *
 * Kernel taps that fall outside the image reuse the nearest edge pixel.
 * Every output channel is clamped to [0, 255] before it is stored.
 *
 * MMX register roles (they carry state from one asm statement to the next,
 * so no other MMX or x87 code may be placed between those statements):
 *   mm3  { 255.0, 255.0 }   upper clamp
 *   mm4  {   0.0,   0.0 }   lower clamp
 *   mm5  scratch: broadcast kernel weight, later the broadcast reciprocal
 *   mm6  blue accumulator (low lane)
 *   mm7  red accumulator (low lane), green accumulator (high lane)
 *   mm0, mm1  per-tap temporaries
 */
void do_blur(pixel **out_image, pixel **in_image, int n_cols, int n_rows) {
  static const float filter[3][3] = {{1.0f, 2.0f, 1.0f},
                                     {2.0f, 4.0f, 2.0f},
                                     {1.0f, 2.0f, 1.0f}};
  float filter_div = 16.0f;
  float filter_div_rcp = 0.0f;
  float clamp_lo = 0.0f;
  float clamp_hi = 255.0f;
  int x;
  int y;

  /* filter_div_rcp = 1 / filter_div: pfrcp produces an approximation that
   * pfrcpit1 / pfrcpit2 refine. */
  __asm__ __volatile__("femms\n\t"
                       "movd      %1,    %%mm0\n\t"
                       "pfrcp     %%mm0, %%mm1\n\t"
                       "pfrcpit1  %%mm1, %%mm0\n\t"
                       "pfrcpit2  %%mm1, %%mm0\n\t"
                       "movd      %%mm0, %0\n\t"
                       "femms"
                       : "=m" (filter_div_rcp)
                       : "m" (filter_div)
                       : "mm0", "mm1");

  /* Broadcast the clamp bounds into both lanes of mm4 (low) and mm3 (high). */
  __asm__ __volatile__("movd       %0,    %%mm4\n\t"
                       "punpckldq  %%mm4, %%mm4\n\t"
                       "movd       %1,    %%mm3\n\t"
                       "punpckldq  %%mm3, %%mm3\n\t"
                       :
                       : "m" (clamp_lo), "m" (clamp_hi)
                       : "mm3", "mm4");

  for (y = 0; y < n_rows; y++) {
    for (x = 0; x < n_cols; x++) {
      int res_r;
      int res_g;
      int res_b;
      int i;
      int j;

      /* Reset the per-pixel accumulators. */
      __asm__ __volatile__("pxor %%mm7, %%mm7\n\t"
                           "pxor %%mm6, %%mm6\n\t"
                           :
                           :
                           : "mm6", "mm7");

      for (j = 0; j < 3; j++) {
        for (i = 0; i < 3; i++) {
          int src_x = x + i - 1;
          int src_y = y + j - 1;
          int chan_r;
          int chan_g;
          int chan_b;
          pixel vpix;

          /* Replicate the border: clamp the tap onto the image. */
          if (src_x < 0) {
            src_x = 0;
          }
          if (src_x >= n_cols) {
            src_x = n_cols - 1;
          }
          if (src_y < 0) {
            src_y = 0;
          }
          if (src_y >= n_rows) {
            src_y = n_rows - 1;
          }

          vpix = in_image[src_y][src_x];
          chan_r = PPM_GETR(vpix);
          chan_g = PPM_GETG(vpix);
          chan_b = PPM_GETB(vpix);

          /* The weight is handed over as a memory operand and loaded with a
           * 4-byte movd: this neither depends on the compiler-generated
           * symbol name of the static table nor reads past its last
           * element. */
          __asm__ __volatile__("movd       %3,    %%mm5\n\t"
                               "punpckldq  %%mm5, %%mm5\n\t"

                               /* mm0 = { (float) r, (float) g } */
                               "movd       %0,    %%mm0\n\t"
                               "movd       %1,    %%mm1\n\t"
                               "punpckldq  %%mm1, %%mm0\n\t"
                               "pi2fd      %%mm0, %%mm0\n\t"

                               /* mm1 = { (float) b, - } */
                               "movd       %2,    %%mm1\n\t"
                               "pi2fd      %%mm1, %%mm1\n\t"

                               /* accumulate weight * channel */
                               "pfmul      %%mm5, %%mm0\n\t"
                               "pfadd      %%mm0, %%mm7\n\t"

                               "pfmul      %%mm5, %%mm1\n\t"
                               "pfadd      %%mm1, %%mm6\n\t"
                               :
                               : "r" (chan_r), "r" (chan_g), "r" (chan_b),
                                 "m" (filter[j][i])
                               : "mm0", "mm1", "mm5", "mm6", "mm7");
        }
      }

      /* Normalise by 1/16, clamp to [0, 255], convert back to integers and
       * unpack: red from the low lane of mm7, green from its high lane,
       * blue from the low lane of mm6. */
      __asm__ __volatile__("movd       %3,    %%mm5\n\t"
                           "punpckldq  %%mm5, %%mm5\n\t"

                           "pfmul      %%mm5, %%mm7\n\t"
                           "pfmul      %%mm5, %%mm6\n\t"

                           "pfmax      %%mm4, %%mm7\n\t"
                           "pfmax      %%mm4, %%mm6\n\t"
                           "pfmin      %%mm3, %%mm7\n\t"
                           "pfmin      %%mm3, %%mm6\n\t"

                           "pf2id      %%mm7, %%mm7\n\t"
                           "pf2id      %%mm6, %%mm6\n\t"

                           "movd       %%mm7, %0\n\t"
                           "punpckhdq  %%mm7, %%mm7\n\t"
                           "movd       %%mm7, %1\n\t"
                           "movd       %%mm6, %2\n\t"
                           : "=r" (res_r), "=r" (res_g), "=r" (res_b)
                           : "m" (filter_div_rcp)
                           : "mm5", "mm6", "mm7");

      PPM_ASSIGN(out_image[y][x],
                 (pixval) res_r,
                 (pixval) res_g,
                 (pixval) res_b);
    }
  }

  /* Leave MMX/3DNow! mode so the caller's x87 floating-point code sees a
   * clean register state. */
  __asm__ __volatile__("femms");
}
