=begin
	$Id: mmx.rb,v 1.9 2003/07/11 14:09:45 matju Exp $

	GridFlow
	Copyright (c) 2001,2002 by Mathieu Bouchard

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.

	See file ../COPYING for further informations on licensing terms.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=end

# Usage: ruby mmx.rb <asm-output-file> <loader-output-file>
#
# STDOUT is re-pointed at the first file, so every bare `puts` in this
# script emits assembly text; the second file receives the C-side glue
# through $loader.
STDOUT.reopen(ARGV[0], "w")
$loader = File.open(ARGV[1], "w")

# $count tallies the generated functions (it is reported on STDERR at the
# end of the script). $lines is initialised here but not referenced in the
# visible part of this file.
$count = $lines = 0

# First line of each output file.
puts("; generated by/for GridFlow 0.7.3")
$loader.puts("#include \"../base/grid.h.fcs\"\nextern \"C\" {")

# Helper that prints the scaffolding of one x86 assembly function on
# standard output. The generators below use both AsmFunction.make (function
# prologue/epilogue) and #make_until (labelled loops).
class AsmFunction
	# name - symbol name of the function; also the prefix of its loop labels.
	def initialize(name)
		@name = name
		@label_count = 1
	end

	# Prints "GLOBAL name", the entry label and a prologue (sets up ebp,
	# saves esi and edi), yields a fresh AsmFunction so the caller can emit
	# the body, then prints the matching epilogue.
	def self.make(name)
		puts "", "GLOBAL #{name}", "#{name}:"
		puts "push ebp", "mov ebp,esp", "push esi", "push edi"
		yield AsmFunction.new(name)
		puts "pop edi", "pop esi", "leave", "ret", ""
	end

	# Prints a loop made of: a start label, the instructions in +ops+, whatever
	# the given block prints, a jump back to the start label, and an exit label.
	#
	# The exit label's name is appended to the LAST instruction of +ops+, so
	# that instruction must be a jump written without its target (e.g. "jz").
	#
	# The strings passed in are not modified: the completed jump is built as
	# a new string, so frozen or shared operand strings are safe to pass.
	#
	# Raises ArgumentError when called without any instruction.
	def make_until(*ops)
		raise ArgumentError, "make_until needs at least one instruction" if ops.empty?
		top_label = "#{@name}_#{@label_count}"
		exit_label = "#{@name}_#{@label_count+1}"
		@label_count += 2
		head = ops[0...-1] + ["#{ops[-1]} #{exit_label}"]
		# note: the start label is printed with a trailing space after the colon
		puts "#{top_label}: ", *head
		yield
		puts "jmp #{top_label}"
		puts "#{exit_label}:"
	end
end

# Size in bytes of one element of each numeric type; the generators use it
# as the pointer increment per element.
$sizeof = {
	:uint8   => 1, :int16   => 2,
	:int32   => 4, :int64   => 8,
	:float32 => 4, :float64 => 8,
}

# Name of the accumulator register (al / ax / eax) that holds one element
# of each integer type in the scalar tail loops.
$accum = { :uint8 => "al", :int16 => "ax", :int32 => "eax" }

# Operand-size keyword written in front of memory operands for each
# integer type.
$asm_type = { :uint8 => "byte", :int16 => "word", :int32 => "dword", :int64 => "qword" }

# One row per binary operation. Column layout of each row:
#   [0]     operator text placed inside SYM(...) in the generated installer
#   [1]     scalar x86 mnemonic
#   [2..5]  MMX mnemonic for uint8, int16, int32 and int64 elements
# The rows left as comments at the bottom are disabled and generate nothing.
$opcodes = {
	:add => %w[+ add paddb paddw paddd paddq],
	:sub => %w[- sub psubb psubw psubd psubq],
	:and => %w[& and pand pand pand pand],
	:xor => %w[^ xor pxor pxor pxor pxor],
	:or  => %w[| or por por por por],
#	:eq  => ["==", nil,   "pcmpeqb","pcmpeqw","pcmpeqd",nil],
#	:gt  => [">",  nil,   "pcmpgtb","pcmpgtw","pcmpgtd",nil],
#	:shl => ["<<", "shl", nil,    "psllw","pslld","psllq"], # noncommutative
#	:shr => [">>", "sar", nil,    "psraw","psrad"], # noncommutative
#	:addclamp => [nil,nil,"paddusb","paddsw",nil,nil,nil],
#	:subclamp => [nil,nil,"psubusb","psubsw",nil,nil,nil],
#	:andnot => [nil,nil,"pandn","pandn","pandn","pandn"],
}

# Text buffers filled in by the generators: $decls collects the C prototypes
# and $install the statements that end up inside startup_cpu().
$decls, $install = "", ""

# Emits the assembly routine "mmx_<type>_map_<op>", which applies +op+
# between every element of a buffer and one scalar operand, in place, and
# records the matching C prototype ($decls) and installer line ($install).
#
# op   - key of a row in $opcodes (e.g. :add)
# type - key into $sizeof / $accum / $asm_type (e.g. :uint8)
#
# Returns nil without emitting anything when the selected $opcodes column
# holds no MMX mnemonic; otherwise returns the updated $count.
def make_fun_map(op,type)
	fname = "mmx_#{type}_map_#{op}"
	width = $sizeof[type]
	reg = $accum[type]
	row = $opcodes[op]
	c_sym, scalar_op = row[0], row[1]
	# MMX mnemonics start at column 2: widths 1 and 2 select columns 2 and 3,
	# any other width is used as the column index directly.
	packed_op = row[width < 4 ? width+1 : width]
	return unless packed_op
	per_quad = 8/width # elements held by one 8-byte MMX register
	AsmFunction.make(fname) do |fn|
		# arguments: ecx = element count, esi = buffer, eax = scalar operand
		puts "mov ecx,[ebp+8]", "mov esi,[ebp+12]", "mov eax,[ebp+16]"
		# replicate the scalar until it fills all 32 bits of eax ...
		if width == 1
			puts "mov dx,ax", "shl eax,8", "mov al,dl"
		end
		if width <= 2
			puts "mov edx,eax", "shl eax,16", "mov ax,dx"
		end
		# ... then store it twice on the stack and load the 8 bytes into mm7
		puts "push eax", "push eax", "movq mm7,[esp]", "add esp,8"
		# Loop processing +quads+ MMX registers' worth of data per pass,
		# repeated while at least that many elements remain.
		unrolled = lambda do |quads|
			chunk = per_quad*quads
			fn.make_until("cmp ecx,#{chunk}","jb near") do
				# groups of at most 4 registers; a final step equal to
				# quads yields an empty group and prints nothing
				0.step(quads,4) do |base|
					lanes = (0...[quads-base,4].min).to_a
					lanes.each {|r| puts "movq mm#{r},[esi+#{8*(base+r)}]" }
					lanes.each {|r| puts "#{packed_op} mm#{r},mm7" }
					lanes.each {|r| puts "movq [esi+#{8*(base+r)}],mm#{r}" }
				end
				puts "lea esi,[esi+#{8*quads}]", "lea ecx,[ecx-#{chunk}]"
			end
		end
		unrolled.call(4)
		unrolled.call(1)
		# scalar loop for the elements that are left over
		fn.make_until("test ecx,ecx", "jz") do
			puts "#{scalar_op} #{$asm_type[type]} [esi],#{reg}",
			     "lea esi,[esi+#{width}]",
			     "dec ecx"
		end
		puts "emms"
	end
	$decls << "void #{fname}(int,#{type}*,#{type});\n"
	$install << "FIX2PTR(Numop2,rb_hash_aref(op2_dict,SYM(#{c_sym})))" <<
	            "->on_#{type}.op_map = #{fname};\n"
	$count += 1
end

def make_fun_zip(op,type)
s="mmx_#{type}_zip_#{op}"
	size = $sizeof[type]
	accum = $accum[type]
	sym = $opcodes[op][0]
	opcode = $opcodes[op][1]
	mopcode = $opcodes[op][size+(size<4 ? 1 : 0)]
	return if not mopcode
	AsmFunction.make(s) {|a|
		puts "mov ecx,[ebp+8]",  "mov edi,[ebp+12]",
		     "mov esi,[ebp+16]"#, "mov ebx,[ebp+20]"
		foo = proc {|n|
			a.make_until("cmp ecx,#{8/size*n}","jb near") {
				0.step(n,4) {|k|
				nn=[n-k,4].min
				o=(0..3).map{|x| 8*(x+k) }
				for i in 0...nn do puts "movq mm#{i},[edi+#{o[i]}]" end
				for i in 0...nn do puts "movq mm#{i+4},[esi+#{o[i]}]" end
				for i in 0...nn do puts "#{mopcode} mm#{i},mm#{i+4}" end
				for i in 0...nn do puts "movq [edi+#{o[i]}],mm#{i}" end
				}
				#for i in 0...n do puts "movq [ebx+#{8*i}],mm#{i}" end
				puts "lea edi,[edi+#{8*n}]"
				puts "lea esi,[esi+#{8*n}]"
				#puts "lea ebx,[ebx+#{8*n}]"
				puts "lea ecx,[ecx-#{8/size*n}]"
			}
		}
		foo.call 4
		foo.call 1
		a.make_until("test ecx,ecx", "jz") {
			# requires commutativity ??? fails with shl, shr
			puts "mov #{accum},[esi]"
			puts "#{opcode} #{$asm_type[type]} [edi],#{accum}"
			#puts "mov #{accum},[edi]"
			#puts "#{opcode} #{accum},[esi]"
			#puts "mov [ebx],#{accum}"
			puts "lea edi,[edi+#{size}]"
			puts "lea esi,[esi+#{size}]"
			#puts "lea ebx,[ebx+#{size}]"
			puts "dec ecx"
		}
		puts "emms"
	}
	#$decls << "void #{s}(int,#{type}*,#{type}*,#{type}*);\n"
	$decls << "void #{s}(int,#{type}*,#{type}*);\n"
	$install << "FIX2PTR(Numop2,rb_hash_aref(op2_dict,SYM(#{sym})))"
	$install << "->on_#{type}.op_zip = #{s};\n"
	$count += 1
end

# Generate one map routine and one zip routine for every operation in
# $opcodes and every enabled element type (:int32 is currently disabled).
$opcodes.each_key do |op|
	[:uint8, :int16].each do |type|
		make_fun_map(op,type)
		make_fun_zip(op,type)
	end
end

# Finish the loader file: the collected prototypes, the end of the
# extern "C" section, and startup_cpu(), whose body is the collected
# installer statements.
$loader.puts $decls
$loader.puts "
}; /* extern */
void startup_cpu () {
	gfpost(\"startup_cpu: using MMX optimisations\");
	#{$install}
}
"
# Nothing is written to the loader after this point, so close it explicitly
# instead of leaving the handle open until the interpreter exits.
$loader.close

STDERR.puts "automatically generated #{$count} MMX asm functions"

=begin notes:
CPUID has a bit for detecting MMX
PACKSSDW PACKSSWB PACKUSWB = saturation-casting
PCMPxx: Compare Packed Integers
PMULHW, PMULLW: Multiply Packed _signed_ 16-bit Integers, and Store High/Low Result (PMULHUW is the unsigned high-half form)
PUNPCKxxx: Unpack and Interleave Data
=end
