Skip to main content

By clicking Submit, you agree to the developerWorks terms of use.

The first time you sign in to developerWorks, a profile is created for you. Selected information in your profile (name, country/region, and company) is displayed to the public and will accompany any content you post. You may update your IBM account at any time.

All information submitted is secure.

  • Close [x]

The first time you sign in to developerWorks, a profile is created for you, so you need to choose a display name. Your display name accompanies the content you post on developerWorks.

Please choose a display name between 3 and 31 characters long. Your display name must be unique in the developerWorks community and should not be your email address for privacy reasons.

By clicking Submit, you agree to the developerWorks terms of use.

All information submitted is secure.

  • Close [x]

Unrolling AltiVec, Part 3: Down and dirty loop optimization

Learn how to tailor your code for AltiVec

Return to article


Listing 2. The AltiVec-optimized vectorized code (color_vec.c)


#include <sys/time.h>
#include <math.h>
#include <stdio.h>

/* vec_ld and vec_st ignore the low four bits of the effective address, so
 * every scratch buffer that is moved to or from a vector register has to be
 * 16-byte aligned. */
#define INC_HSV_ALIGN16 __attribute__((aligned(16)))

/*
 * inc_hsv - advance the hue of planar RGB pixels in place.
 *
 * Each pixel is converted from RGB to hue/saturation/value, its hue is
 * moved forward by half a sector (one twelfth of the hue circle), and the
 * result is converted back to RGB and written over the input.
 *
 * size   number of pixels in each plane.  Pixels are processed 16 at a
 *        time, so a size that is not a multiple of 16 makes the last pass
 *        read and write up to the next multiple of 16.
 * red, green, blue
 *        the three colour planes.  They are read with vec_ld, so each
 *        pointer must be 16-byte aligned.
 */
void
inc_hsv(int size, unsigned char *red, unsigned char *green, unsigned char *blue) {
	int i;
	unsigned char *r, *g, *b;
	vector float one_255th = { 1.0 / 255, 1.0 / 255, 1.0 / 255, 1.0 / 255 };
	vector float fh_factor = { 6.0 / 252.0, 6.0 / 252.0,
				   6.0 / 252.0, 6.0 / 252.0 };
	vector float vec_zero = { 0.0 };
	vector float vec_half = { 0.5, 0.5, 0.5, 0.5 };
	vector float vec_unity = { 1.0, 1.0, 1.0, 1.0 };
	vector float vec_six = { 6.0, 6.0, 6.0, 6.0 };
	vector float vec255 = { 255.0, 255.0, 255.0, 255.0 };
	vector unsigned char u8_1 = vec_splat_u8(1);
	vector unsigned short u16_1 = vec_splat_u16(1);
	vector unsigned char u8_42, u8_84, u8_168;

	// Widening 16 bytes to four float vectors takes two rounds of
	// even/odd multiplies.  Starting from bytes
	// 0123456789abcdef
	// the element each output lane comes from would be:
	// 000044448888cccc
	// 22226666aaaaeeee
	// 111155559999dddd
	// 33337777bbbbffff
	// so the bytes are pre-shuffled with "inorder" to make the four float
	// vectors come out as elements 0-3, 4-7, 8-11 and 12-15.
	static const unsigned char inorder_values[16] INC_HSV_ALIGN16 = {
		0x00, 0x08, 0x04, 0x0c,
		0x01, 0x09, 0x05, 0x0d,
		0x02, 0x0a, 0x06, 0x0e,
		0x03, 0x0b, 0x07, 0x0f
	};
	// "split" interleaves the low and high halves so that one even/odd
	// multiply yields elements 0-7 and 8-15 as two vectors of shorts.
	static const unsigned char split_values[16] INC_HSV_ALIGN16 = {
		0x00, 0x08, 0x01, 0x09,
		0x02, 0x0a, 0x03, 0x0b,
		0x04, 0x0c, 0x05, 0x0d,
		0x06, 0x0e, 0x07, 0x0f
	};
	vector unsigned char inorder = vec_ld(0, inorder_values);
	vector unsigned char split = vec_ld(0, split_values);

	// vec_splat_u8 only takes a small immediate: build 21, then double.
	u8_42 = vec_add(vec_splat_u8(10), vec_splat_u8(11));
	u8_42 = vec_add(u8_42, u8_42);
	u8_84 = vec_add(u8_42, u8_42);		// hue offset of the green third
	u8_168 = vec_add(u8_84, u8_84);		// hue offset of the blue third

	for (i = 0; i < size; i += 16) {
		int j;

		vector unsigned char r16, g16, b16;
		vector unsigned char max16, min16, delta16;
		vector bool char r_max, g_max, b_max;
		vector unsigned char hue_r16, hue_g16, hue_b16;
		vector unsigned char hue16 = vec_splat_u8(0);
		vector unsigned char sat16;
		vector unsigned char scratch8;
		vector unsigned short scratch[2];
		vector signed short r42[2], g42[2], b42[2];
		vector float fh[4];
		vector float fv[4];
		vector float fs[4];
		unsigned char delta[16] INC_HSV_ALIGN16;
		unsigned char values[16] INC_HSV_ALIGN16;
		unsigned char sats[16] INC_HSV_ALIGN16;
		unsigned char hue_r[16] INC_HSV_ALIGN16;
		unsigned char hue_g[16] INC_HSV_ALIGN16;
		unsigned char hue_b[16] INC_HSV_ALIGN16;
		signed short g_b[16] INC_HSV_ALIGN16;
		signed short b_r[16] INC_HSV_ALIGN16;
		signed short r_g[16] INC_HSV_ALIGN16;

		r = red + i;
		g = green + i;
		b = blue + i;
		r16 = vec_ld(0, r);
		g16 = vec_ld(0, g);
		b16 = vec_ld(0, b);

		// value = max channel, delta = max - min
		max16 = vec_max(vec_max(r16, g16), b16);
		min16 = vec_min(vec_min(r16, g16), b16);
		delta16 = vec_sub(max16, min16);

		// saturation needs a per-element integer divide, done in scalar
		vec_st(delta16, 0, delta);
		vec_st(max16, 0, values);
		for (j = 0; j < 16; ++j) {
			// a black pixel has value 0; give it saturation 0
			// instead of dividing by zero
			sats[j] = values[j] ? (255 * delta[j] / values[j]) : 0;
		}
		sat16 = vec_ld(0, sats);

		// these indicate which values to take from the
		// result registers
		r_max = vec_cmpeq(max16, r16);
		g_max = vec_cmpeq(max16, g16);
		b_max = vec_cmpeq(max16, b16);

		// precalculate (x - y) * 42 for each channel pair; every
		// channel is widened and scaled once and then reused
		scratch8 = vec_perm(r16, r16, split);
		r42[0] = (vector signed short) vec_mule(scratch8, u8_42);
		r42[1] = (vector signed short) vec_mulo(scratch8, u8_42);
		scratch8 = vec_perm(g16, g16, split);
		g42[0] = (vector signed short) vec_mule(scratch8, u8_42);
		g42[1] = (vector signed short) vec_mulo(scratch8, u8_42);
		scratch8 = vec_perm(b16, b16, split);
		b42[0] = (vector signed short) vec_mule(scratch8, u8_42);
		b42[1] = (vector signed short) vec_mulo(scratch8, u8_42);

		vec_st(vec_sub(g42[0], b42[0]), 0, g_b);
		vec_st(vec_sub(g42[1], b42[1]), 16, g_b);
		vec_st(vec_sub(b42[0], r42[0]), 0, b_r);
		vec_st(vec_sub(b42[1], r42[1]), 16, b_r);
		vec_st(vec_sub(r42[0], g42[0]), 0, r_g);
		vec_st(vec_sub(r42[1], g42[1]), 16, r_g);

		// Hue ranges from 0 to 251, because 252 is divisible by 6,
		// and this eliminates a subtle drifting effect.
		for (j = 0; j < 16; ++j) {
			// start from zero so the vector loads below never
			// read indeterminate bytes
			hue_r[j] = hue_g[j] = hue_b[j] = 0;
			if (delta[j] == 0) {
				continue;
			}
			// The quotients lie in [-42, 42].  Going through int
			// makes the wrap to unsigned char well defined.
			if (r[j] == values[j]) {
				hue_r[j] = (unsigned char) (int) (g_b[j] / (float) delta[j]);
				// a negative offset wrapped modulo 256; pull
				// it back onto the 252-step hue circle
				if (hue_r[j] > 127)
					hue_r[j] -= 4;
			}
			if (g[j] == values[j]) {
				hue_g[j] = (unsigned char) (int) (b_r[j] / (float) delta[j]);
			}
			if (b[j] == values[j]) {
				hue_b[j] = (unsigned char) (int) (r_g[j] / (float) delta[j]);
			}
		}
		hue_r16 = vec_ld(0, hue_r);
		hue_g16 = vec_add(u8_84, vec_ld(0, hue_g));
		hue_b16 = vec_add(u8_168, vec_ld(0, hue_b));

		hue16 = vec_sel(hue16, hue_r16, r_max);
		hue16 = vec_sel(hue16, hue_g16, g_max);
		hue16 = vec_sel(hue16, hue_b16, b_max);

		// we now have hue, sat, and value registers of 16 values at
		// once... unfortunately, we need to multiply by a float
		// value, so some splitting up needs to happen
		hue16 = vec_perm(hue16, hue16, inorder);
		scratch[0] = vec_mule(hue16, u8_1);
		scratch[1] = vec_mulo(hue16, u8_1);
		max16 = vec_perm(max16, max16, inorder);
		// fh = hue scaled to sectors (0..6) plus the half-sector step
		fh[0] = vec_madd(vec_ctf(vec_mule(scratch[0], u16_1), 0),
			fh_factor, vec_half);
		fh[1] = vec_madd(vec_ctf(vec_mulo(scratch[0], u16_1), 0),
			fh_factor, vec_half);
		scratch[0] = vec_mule(max16, u8_1);
		fh[2] = vec_madd(vec_ctf(vec_mule(scratch[1], u16_1), 0),
			fh_factor, vec_half);
		fh[3] = vec_madd(vec_ctf(vec_mulo(scratch[1], u16_1), 0),
			fh_factor, vec_half);
		scratch[1] = vec_mulo(max16, u8_1);
		sat16 = vec_perm(sat16, sat16, inorder);
		fv[0] = vec_ctf(vec_mule(scratch[0], u16_1), 0);
		fv[1] = vec_ctf(vec_mulo(scratch[0], u16_1), 0);
		scratch[0] = vec_mule(sat16, u8_1);
		fv[2] = vec_ctf(vec_mule(scratch[1], u16_1), 0);
		fv[3] = vec_ctf(vec_mulo(scratch[1], u16_1), 0);
		scratch[1] = vec_mulo(sat16, u8_1);
		fs[0] = vec_ctf(vec_mule(scratch[0], u16_1), 0);
		fs[1] = vec_ctf(vec_mulo(scratch[0], u16_1), 0);
		fs[2] = vec_ctf(vec_mule(scratch[1], u16_1), 0);
		fs[3] = vec_ctf(vec_mulo(scratch[1], u16_1), 0);

		for (j = 0; j < 4; ++j) {
			vector float whole, fraction;
			vector float max_255;
			vector float p, q, t;
			vector bool int wrap;
			float ps[4] INC_HSV_ALIGN16;
			float qs[4] INC_HSV_ALIGN16;
			float ts[4] INC_HSV_ALIGN16;
			unsigned int sectors[4] INC_HSV_ALIGN16;
			int k;

			// Wrap the hue back into [0, 6).  The comparison is
			// >= so that exactly 6.0 becomes sector 0 rather than
			// falling into the sector-5 branch with fraction 0.
			wrap = vec_cmpge(fh[j], vec_six);
			fh[j] = vec_sel(fh[j], vec_sub(fh[j], vec_six), wrap);

			whole = vec_floor(fh[j]);
			vec_st(vec_ctu(whole, 0), 0, sectors);
			fraction = vec_sub(fh[j], whole);

			/* p = (255 - s) * (max / 255);
			 * q = (255 - s * f) * (max / 255);
			 * t = (255 - s * (1 - f)) * (max / 255);
			 */
			max_255 = vec_madd(fv[j], one_255th, vec_zero);
			p = vec_sub(vec255, fs[j]);
			q = vec_sub(vec255, vec_madd(fs[j], fraction, vec_zero));
			t = vec_sub(vec255,
				vec_madd(fs[j],
					vec_sub(vec_unity, fraction),
					vec_zero));

			p = vec_madd(p, max_255, vec_zero);
			q = vec_madd(q, max_255, vec_zero);
			t = vec_madd(t, max_255, vec_zero);

			vec_st(p, 0, ps);
			vec_st(q, 0, qs);
			vec_st(t, 0, ts);

			// fh[j] holds pixels 4j .. 4j+3 of this group of 16
			for (k = 0; k < 4; ++k) {
				int index = j * 4 + k;

				switch (sectors[k]) {
					case 0:
						r[index] = values[index];
						g[index] = ts[k];
						b[index] = ps[k];
						break;
					case 1:
						r[index] = qs[k];
						g[index] = values[index];
						b[index] = ps[k];
						break;
					case 2:
						r[index] = ps[k];
						g[index] = values[index];
						b[index] = ts[k];
						break;
					case 3:
						r[index] = ps[k];
						g[index] = qs[k];
						b[index] = values[index];
						break;
					case 4:
						r[index] = ts[k];
						g[index] = ps[k];
						b[index] = values[index];
						break;
					default:		// case 5:
						r[index] = values[index];
						g[index] = ps[k];
						b[index] = qs[k];
						break;
				}
			}
		}
	}
}

/*
 * Thirty-two fully saturated colours taken in order around the colour
 * wheel; main() tiles them across the pixel planes as test input.
 */
struct { int r, g, b; } colors[] = {
	/* red towards yellow: green rises */
	{ 0xFF, 0x00, 0x00 },
	{ 0xFF, 0x2F, 0x00 },
	{ 0xFF, 0x5F, 0x00 },
	{ 0xFF, 0x8F, 0x00 },
	{ 0xFF, 0xBF, 0x00 },
	{ 0xFF, 0xEF, 0x00 },

	/* yellow towards green: red falls */
	{ 0xDF, 0xFF, 0x00 },
	{ 0xAF, 0xFF, 0x00 },
	{ 0x7F, 0xFF, 0x00 },
	{ 0x4F, 0xFF, 0x00 },
	{ 0x1F, 0xFF, 0x00 },

	/* green towards cyan: blue rises */
	{ 0x00, 0xFF, 0x0F },
	{ 0x00, 0xFF, 0x3F },
	{ 0x00, 0xFF, 0x6F },
	{ 0x00, 0xFF, 0x9F },
	{ 0x00, 0xFF, 0xCF },
	{ 0x00, 0xFF, 0xFF },

	/* cyan towards blue: green falls */
	{ 0x00, 0xCF, 0xFF },
	{ 0x00, 0x9F, 0xFF },
	{ 0x00, 0x6F, 0xFF },
	{ 0x00, 0x3F, 0xFF },
	{ 0x00, 0x0F, 0xFF },

	/* blue towards magenta: red rises */
	{ 0x1F, 0x00, 0xFF },
	{ 0x4F, 0x00, 0xFF },
	{ 0x7F, 0x00, 0xFF },
	{ 0xAF, 0x00, 0xFF },
	{ 0xDF, 0x00, 0xFF },

	/* magenta back towards red: blue falls */
	{ 0xFF, 0x00, 0xEF },
	{ 0xFF, 0x00, 0xBF },
	{ 0xFF, 0x00, 0x8F },
	{ 0xFF, 0x00, 0x5F },
	{ 0xFF, 0x00, 0x2F }
};

/*
 * The three colour planes handed to inc_hsv().  inc_hsv() reads them with
 * vec_ld, which ignores the low four address bits, so each plane is
 * explicitly aligned to 16 bytes rather than relying on the linker.
 */
unsigned char red[65536] __attribute__((aligned(16)));
unsigned char green[65536] __attribute__((aligned(16)));
unsigned char blue[65536] __attribute__((aligned(16)));

int
main(void) {
	struct timezone dontcare = { 0, 0 };
	struct timeval before, after;
	long long microsec;
	int i, j = 0;

	// set up arrays from our little rainbow
	for (i = 0; i < 65536; ++i) {
		red[i] = colors[j].r;
		green[i] = colors[j].g;
		blue[i] = colors[j].b;
		++j;
		if (j == (sizeof(colors) / sizeof(*colors))) {
			j = 0;
		}
	}
	for (j = 0; j < 32; j += 2)
		printf("(%d) { %02x %02x %02x }\n", j, red[j], green[j], blue[j]);

	gettimeofday(&before, &dontcare);
	for (i = 0; i < 200; ++i) {
		inc_hsv(65536, red, green, blue);
	}
	gettimeofday(&after, &dontcare);
	microsec = (after.tv_usec - before.tv_usec) +
		1000000 * (after.tv_sec - before.tv_sec);
	for (j = 0; j < 32; j += 2)
		printf("(%d) { %02x %02x %02x }\n", j, red[j], green[j], blue[j]);
	printf("%lld microseconds\n", microsec);
	return 0;
}


Return to article