analysis-tools.c 71 KB
Newer Older
1 2 3 4 5
/*
 * analysis-tools.c: 
 *
 * Author:
 *  Jukka-Pekka Iivonen <iivonen@iki.fi>
6 7
 *
 * (C) Copyright 2000 by Jukka-Pekka Iivonen <iivonen@iki.fi>
8 9 10 11 12 13
 */

#include <config.h>
#include <gnome.h>
#include <string.h>
#include <math.h>
14 15
#include "mathfunc.h"
#include "numbers.h"
16 17 18
#include "gnumeric.h"
#include "gnumeric-util.h"
#include "dialogs.h"
19
#include "parse-util.h"
Arturo Espinosa's avatar
Arturo Espinosa committed
20
#include "tools.h"
21
#include "value.h"
22
#include "regression.h"
23
#include "workbook.h"
24 25 26 27 28 29

typedef struct {
        GSList  *array;
        float_t sum;
        float_t sum2;    /* square of the sum */
        float_t sqrsum;
30 31
        float_t min;
        float_t max;
32 33 34 35 36 37 38
        int     n;
} data_set_t;



/***** Some general routines ***********************************************/

39
static int
40
int_compare (const void *px, const void *py)
41
{
42 43 44
	const int *x = px;
	const int *y = py;
	
45 46 47 48 49 50 51 52
        if (*x < *y)
	        return -1;
	else if (*x == *y)
	        return 0;
	else
	        return 1;
}

53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
static gint
float_compare (const float_t *a, const float_t *b)
{
        if (*a < *b)
                return -1;
        else if (*a == *b)
                return 0;
        else
                return 1;
}

static gint
float_compare_desc (const float_t *a, const float_t *b)
{
        if (*a < *b)
                return 1;
        else if (*a == *b)
                return 0;
        else
                return -1;
}

75
Cell *
76
set_cell (data_analysis_output_t *dao, int col, int row, char *text)
77 78
{
        Cell *cell;
79 80 81 82 83

	/* Check that the output is in the given range */
	if (dao->type == RangeOutput && (col >= dao->cols || row >= dao->rows))
	        return NULL;

84 85 86
	cell = sheet_cell_fetch (dao->sheet, dao->start_col+col, 
				 dao->start_row+row);
	sheet_cell_set_text (cell, text);
87 88 89 90

	return cell;
}

91
static void
Arturo Espinosa's avatar
Arturo Espinosa committed
92
get_data (Sheet *sheet, Range *range, data_set_t *data)
93 94 95 96 97 98 99 100 101 102 103 104 105
{
        gpointer p;
	Cell     *cell;
	Value    *v;
	float_t  x;
	int      row, col;

	data->sum = 0;
	data->sum2 = 0;
	data->sqrsum = 0;
	data->n = 0;
	data->array = NULL;

106 107
	for (col=range->start.col; col<=range->end.col; col++)
	        for (row=range->start.row; row<=range->end.row; row++) {
Arturo Espinosa's avatar
Arturo Espinosa committed
108
		        cell = sheet_cell_get (sheet, col, row);
109 110
			if (cell != NULL && cell->value != NULL) {
			        v = cell->value;
Arturo Espinosa's avatar
Arturo Espinosa committed
111
				if (VALUE_IS_NUMBER (v))
112 113 114 115
				        x = value_get_as_float (v);
				else
				        x = 0;

Arturo Espinosa's avatar
Arturo Espinosa committed
116 117 118
				p = g_new (float_t, 1);
				* ( (float_t *) p) = x;
				data->array = g_slist_append (data->array, p);
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
				data->sum += x;
				data->sqrsum += x*x;
				if (data->n == 0) {
				        data->min = x;
					data->max = x;
				} else {
				        if (data->min > x)
					        data->min = x;
					if (data->max < x)
					        data->max = x;
				}
				data->n++;
			}
		}

	data->sum2 = data->sum * data->sum;
}
136 137

static void
Arturo Espinosa's avatar
Arturo Espinosa committed
138
get_data_groupped_by_columns (Sheet *sheet, Range *range, int col,
139 140 141 142 143 144 145 146 147 148 149 150 151 152
			     data_set_t *data)
{
        gpointer p;
	Cell     *cell;
	Value    *v;
	float_t  x;
	int      row;

	data->sum = 0;
	data->sum2 = 0;
	data->sqrsum = 0;
	data->n = 0;
	data->array = NULL;

153
	for (row=range->start.row; row<=range->end.row; row++) {
Arturo Espinosa's avatar
Arturo Espinosa committed
154
	       cell = sheet_cell_get (sheet, col, row);
155 156
	       if (cell != NULL && cell->value != NULL) {
		       v = cell->value;
Arturo Espinosa's avatar
Arturo Espinosa committed
157
		       if (VALUE_IS_NUMBER (v))
158 159 160 161
			       x = value_get_as_float (v);
		       else
			       x = 0;

Arturo Espinosa's avatar
Arturo Espinosa committed
162 163 164
		       p = g_new (float_t, 1);
		       * ( (float_t *) p) = x;
		       data->array = g_slist_append (data->array, p);
165 166
		       data->sum += x;
		       data->sqrsum += x*x;
167 168 169 170 171 172 173 174 175
		       if (data->n == 0) {
			       data->min = x;
			       data->max = x;
		       } else {
			       if (data->min > x)
				       data->min = x;
			       if (data->max < x)
				       data->max = x;
		       }
176 177 178 179 180 181 182 183
		       data->n++;
	       }
	}

	data->sum2 = data->sum * data->sum;
}

static void
Arturo Espinosa's avatar
Arturo Espinosa committed
184
get_data_groupped_by_rows (Sheet *sheet, Range *range, int row,
185 186 187 188 189 190 191 192 193 194 195 196 197 198
			  data_set_t *data)
{
        gpointer p;
	Cell     *cell;
	Value    *v;
	float_t  x;
	int      col;

	data->sum = 0;
	data->sum2 = 0;
	data->sqrsum = 0;
	data->n = 0;
	data->array = NULL;

199
	for (col=range->start.col; col<=range->end.col; col++) {
Arturo Espinosa's avatar
Arturo Espinosa committed
200
	       cell = sheet_cell_get (sheet, col, row);
201 202
	       if (cell != NULL && cell->value != NULL) {
		       v = cell->value;
Arturo Espinosa's avatar
Arturo Espinosa committed
203
		       if (VALUE_IS_NUMBER (v))
204 205 206 207
			       x = value_get_as_float (v);
		       else
			       x = 0;

Arturo Espinosa's avatar
Arturo Espinosa committed
208 209 210
		       p = g_new (float_t, 1);
		       * ( (float_t *) p) = x;
		       data->array = g_slist_append (data->array, p);
211 212 213 214 215 216 217 218 219 220
		       data->sum += x;
		       data->sqrsum += x*x;
		       data->n++;
	       }
	}

	data->sum2 = data->sum * data->sum;
}

static void
Arturo Espinosa's avatar
Arturo Espinosa committed
221
free_data_set (data_set_t *data)
222 223 224 225
{
        GSList *current = data->array;

	while (current != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
226
	        g_free (current->data);
227 228 229
		current=current->next;
	}

Arturo Espinosa's avatar
Arturo Espinosa committed
230
	g_slist_free (data->array);
231 232
}

233
void
Arturo Espinosa's avatar
Arturo Espinosa committed
234
prepare_output (Workbook *wb, data_analysis_output_t *dao, char *name)
235
{
236 237
	char *unique_name;
	
238
	if (dao->type == NewSheetOutput) {
239 240 241
		unique_name = workbook_sheet_get_free_name (wb, name, FALSE);
	        dao->sheet = sheet_new (wb, unique_name);
		g_free (unique_name);
242
		dao->start_col = dao->start_row = 0;
Arturo Espinosa's avatar
Arturo Espinosa committed
243
		workbook_attach_sheet (wb, dao->sheet);
244 245
	} else if (dao->type == NewWorkbookOutput) {
		wb = workbook_new ();
Arturo Espinosa's avatar
Arturo Espinosa committed
246
		dao->sheet = sheet_new (wb, name);
247
		dao->start_col = dao->start_row = 0;
Arturo Espinosa's avatar
Arturo Espinosa committed
248
		workbook_attach_sheet (wb, dao->sheet);
249 250 251 252
		gtk_widget_show (wb->toplevel);
	}
}

253
void
254 255 256 257 258 259 260 261 262 263 264 265 266 267
autofit_column (data_analysis_output_t *dao, int col)
{
        int ideal_size, actual_col;

	if (dao->type == NewSheetOutput ||
	    dao->type == NewWorkbookOutput)
	        actual_col = col;
	else
	        actual_col = dao->start_col + col;

	ideal_size = sheet_col_size_fit_pixels (dao->sheet, actual_col);
	if (ideal_size == 0)
	        return;

Jody Goldberg's avatar
Jody Goldberg committed
268 269
	sheet_col_set_size_pixels (dao->sheet, actual_col, ideal_size, TRUE);
	sheet_recompute_spans_for_col (dao->sheet, col);
270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
}

static void
set_italic (data_analysis_output_t *dao, int col1, int row1,
	    int col2, int row2)
{
	MStyle *mstyle = mstyle_new ();
	Range  range;
	int    col_inc, row_inc;

	if (dao->type == RangeOutput) {
	        col_inc = dao->start_col;
		row_inc = dao->start_row;
	} else
	        col_inc = row_inc= 0;

	range.start.col = col1 + col_inc;
	range.start.row = row1 + row_inc;
	range.end.col   = col2 + col_inc;
	range.end.row   = row2 + row_inc;

	mstyle_set_font_italic (mstyle, TRUE);
	sheet_style_attach (dao->sheet, range, mstyle);
}
294 295 296 297 298 299 300 301 302 303 304 305

/************* Correlation Tool *******************************************
 *
 * The correlation tool calculates the correlation coefficient of two
 * data sets.  The two data sets can be groupped by rows or by columns.
 * The results are given in a table which can be printed out in a new
 * sheet, in a new workbook, or simply into an existing sheet.
 *
 **/


static float_t
Arturo Espinosa's avatar
Arturo Espinosa committed
306
correl (data_set_t *set_one, data_set_t *set_two, int *error_flag)
307 308 309 310 311 312 313 314 315 316
{
        GSList  *current_one, *current_two;
	float_t sum_xy = 0, c=0;
	float_t tmp;

	*error_flag = 0;
	current_one = set_one->array;
	current_two = set_two->array;

	while (current_one != NULL && current_two != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
317 318
	        sum_xy += * ( (float_t *) current_one->data) *
		  * ( (float_t *) current_two->data);
319 320 321 322 323 324 325 326 327 328 329 330 331
	        current_one = current_one->next;
	        current_two = current_two->next;
	}

	if (current_one != NULL || current_two != NULL)
	        *error_flag = 1;
	else {
		tmp = (set_one->sqrsum - (set_one->sum2 / set_one->n)) *
		  (set_two->sqrsum - (set_two->sum2 / set_two->n));
		if (tmp == 0)
		        *error_flag = 2;
		else
		        c = (sum_xy - (set_one->sum*set_two->sum/set_one->n)) /
Arturo Espinosa's avatar
Arturo Espinosa committed
332
			     sqrt (tmp);
333 334 335 336 337 338 339 340 341
	}

	return c;
}


/* If columns_flag is set, the data entries are groupped by columns 
 * otherwise by rows.
 */
342
int
343
correlation_tool (Workbook *wb, Sheet *sheet, 
344 345
		  Range *input_range, int columns_flag,
		  data_analysis_output_t *dao)
346 347 348 349 350 351 352
{
        data_set_t *data_sets;
	char       buf[256];
	Cell       *cell;
	int        vars, cols, rows, col, row, i;
	int        error;

353 354
	cols = input_range->end.col - input_range->start.col + 1;
	rows = input_range->end.row - input_range->start.row + 1;
355

356
	prepare_output (wb, dao, _("Correlations"));
357

358
	set_cell (dao, 0, 0, " ");
359 360 361

	if (columns_flag) {
	        vars = cols;
362 363 364 365 366
		if (dao->labels_flag) {
		        rows--;
			for (col=0; col<vars; col++) {
			        char *s;
			        cell = sheet_cell_get
367 368
				  (sheet, input_range->start.col+col, 
				   input_range->start.row);
369
				if (cell != NULL && cell->value != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
370
				        s = value_get_as_string (cell->value);
371 372 373 374 375
				        set_cell (dao, 0, col+1, s);
				        set_cell (dao, col+1, 0, s);
				} else
				        return 1;
			}
376
			input_range->start.row++;
377 378
		} else
		        for (col=0; col<vars; col++) {
379
			        sprintf (buf, _("Column %d"), col+1);
380 381 382
				set_cell (dao, 0, col+1, buf);
				set_cell (dao, col+1, 0, buf);
			}
383

Arturo Espinosa's avatar
Arturo Espinosa committed
384
		data_sets = g_new (data_set_t, vars);
385
		for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
386
		        get_data_groupped_by_columns (sheet,
387 388 389 390
						     input_range, i, 
						     &data_sets[i]);
	} else {
	        vars = rows;
391 392 393 394 395
		if (dao->labels_flag) {
		        cols--;
			for (col=0; col<vars; col++) {
			        char *s;
			        cell = sheet_cell_get
396 397
				  (sheet, input_range->start.col,
				   input_range->start.row+col);
398
				if (cell != NULL && cell->value != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
399
				        s = value_get_as_string (cell->value);
400 401 402 403 404
				        set_cell (dao, 0, col+1, s);
				        set_cell (dao, col+1, 0, s);
				} else
				        return 1;
			}
405
			input_range->start.col++;
406 407
		} else 
		        for (col=0; col<vars; col++) {
408
			        sprintf (buf, _("Row %d"), col+1);
409 410 411 412
				set_cell (dao, 0, col+1, buf);
				set_cell (dao, col+1, 0, buf);
			}
 
Arturo Espinosa's avatar
Arturo Espinosa committed
413
		data_sets = g_new (data_set_t, vars);
414 415

		for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
416
		        get_data_groupped_by_rows (sheet,
417 418 419 420 421 422 423
						  input_range, i, 
						  &data_sets[i]);
	}

	for (row=0; row<vars; row++) {
		  for (col=0; col<vars; col++) {
		        if (row == col) {
424
			        set_cell (dao, col+1, row+1, "1");
425 426
				break;
			} else {
Arturo Espinosa's avatar
Arturo Espinosa committed
427
			        sprintf (buf, "%f", correl (&data_sets[col],
428 429 430
							  &data_sets[row],
							  &error));
				if (error)
431
				        set_cell (dao, col+1, row+1, _("#N/A"));
432
				else
433
				        set_cell (dao, col+1, row+1, buf);
434 435 436 437 438
			}
		}
	}

	for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
439
	        free_data_set (&data_sets[i]);
440
	g_free (data_sets);
441

442 443 444 445 446 447 448
	set_italic (dao, 0, 0, 0, vars+1);
	set_italic (dao, 0, 0, vars+1, 0);

	for (col=0; col<=vars; col++)
	        autofit_column (dao, col);

	sheet_set_dirty (dao->sheet, TRUE);
Jody Goldberg's avatar
Jody Goldberg committed
449
	sheet_update (sheet);
450

451
	return 0;
452 453 454 455 456 457 458 459 460 461 462 463 464 465 466
}



/************* Covariance Tool ********************************************
 *
 * The covariance tool calculates the covariance of two data sets.
 * The two data sets can be groupped by rows or by columns.  The
 * results are given in a table which can be printed out in a new
 * sheet, in a new workbook, or simply into an existing sheet.
 *
 **/


static float_t
Arturo Espinosa's avatar
Arturo Espinosa committed
467
covar (data_set_t *set_one, data_set_t *set_two, int *error_flag)
468 469 470 471 472 473 474 475 476 477 478 479 480
{
        GSList  *current_one, *current_two;
	float_t sum = 0, c=0;
	float_t mean1, mean2, x, y;

	*error_flag = 0;
	current_one = set_one->array;
	current_two = set_two->array;

	mean1 = set_one->sum / set_one->n;
	mean2 = set_two->sum / set_two->n;

	while (current_one != NULL && current_two != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
481 482
	        x = * ( (float_t *) current_one->data);
	        y = * ( (float_t *) current_two->data);
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
	        sum += (x - mean1) * (y - mean2);
	        current_one = current_one->next;
	        current_two = current_two->next;
	}

	if (current_one != NULL || current_two != NULL)
	        *error_flag = 1;

	c = sum / set_one->n;

	return c;
}

/* If columns_flag is set, the data entries are groupped by columns 
 * otherwise by rows.
 */
499
int
500
covariance_tool (Workbook *wb, Sheet *sheet, 
501 502
		 Range *input_range, int columns_flag,
		 data_analysis_output_t *dao)
503 504 505 506 507 508 509
{
        data_set_t *data_sets;
	char       buf[256];
	Cell       *cell;
	int        vars, cols, rows, col, row, i;
	int        error;

510
	prepare_output (wb, dao, _("Covariances"));
511

512 513
	cols = input_range->end.col - input_range->start.col + 1;
	rows = input_range->end.row - input_range->start.row + 1;
514

515
	set_cell (dao, 0, 0, " ");
516 517 518

	if (columns_flag) {
	        vars = cols;
519 520 521 522 523
		if (dao->labels_flag) {
		        rows--;
			for (col=0; col<vars; col++) {
			        char *s;
			        cell = sheet_cell_get
524 525
				  (sheet, input_range->start.col+col, 
				   input_range->start.row);
526
				if (cell != NULL && cell->value != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
527
				        s = value_get_as_string (cell->value);
528 529 530 531 532
				        set_cell (dao, 0, col+1, s);
				        set_cell (dao, col+1, 0, s);
				} else
				        return 1;
			}
533
			input_range->start.row++;
534 535
		} else
		        for (col=0; col<vars; col++) {
536
			        sprintf (buf, _("Column %d"), col+1);
537 538 539
				set_cell (dao, 0, col+1, buf);
				set_cell (dao, col+1, 0, buf);
			}
540

Arturo Espinosa's avatar
Arturo Espinosa committed
541
		data_sets = g_new (data_set_t, vars);
542
		for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
543
		        get_data_groupped_by_columns (sheet,
544 545 546 547
						     input_range, i, 
						     &data_sets[i]);
	} else {
	        vars = rows;
548 549 550 551 552
		if (dao->labels_flag) {
		        cols--;
			for (col=0; col<vars; col++) {
			        char *s;
			        cell = sheet_cell_get
553 554
				  (sheet, input_range->start.col,
				   input_range->start.row+col);
555
				if (cell != NULL && cell->value != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
556
				        s = value_get_as_string (cell->value);
557 558 559 560 561
				        set_cell (dao, 0, col+1, s);
				        set_cell (dao, col+1, 0, s);
				} else
				        return 1;
			}
562
			input_range->start.col++;
563 564
		} else 
		        for (col=0; col<vars; col++) {
565
			        sprintf (buf, _("Row %d"), col+1);
566 567 568 569
				set_cell (dao, 0, col+1, buf);
				set_cell (dao, col+1, 0, buf);
			}
 
Arturo Espinosa's avatar
Arturo Espinosa committed
570
		data_sets = g_new (data_set_t, vars);
571 572

		for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
573
		        get_data_groupped_by_rows (sheet,
574 575 576 577 578 579
						  input_range, i, 
						  &data_sets[i]);
	}

	for (row=0; row<vars; row++) {
		  for (col=0; col<vars; col++) {
580
		        if (row < col)
581
				break;
582
			else {
Arturo Espinosa's avatar
Arturo Espinosa committed
583
			        sprintf (buf, "%f", covar (&data_sets[col],
584 585 586
							 &data_sets[row],
							  &error));
				if (error)
587
				        set_cell (dao, col+1, row+1, _("#N/A"));
588
				else
589
				        set_cell (dao, col+1, row+1, buf);
590 591 592 593 594
			}
		}
	}

	for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
595
	        free_data_set (&data_sets[i]);
596
	g_free (data_sets);
597

598 599 600 601 602 603 604
	set_italic (dao, 0, 0, 0, vars+1);
	set_italic (dao, 0, 0, vars+1, 0);

	for (col=0; col<=vars; col++)
	        autofit_column (dao, col);

	sheet_set_dirty (dao->sheet, TRUE);
Jody Goldberg's avatar
Jody Goldberg committed
605
	sheet_update (sheet);
606

607
	return 0;
608
}
609 610 611 612 613 614 615 616 617 618 619 620 621 622


/************* Descriptive Statistics Tool *******************************
 *
 * Descriptive Statistics Tool calculates some useful statistical
 * information such as the mean, standard deviation, sample variance,
 * skewness, kurtosis, and standard error about the given variables.
 * The results are given in a table which can be printed out in a new
 * sheet, in a new workbook, or simply into an existing sheet.
 *
 **/


static float_t
Arturo Espinosa's avatar
Arturo Espinosa committed
623
kurt (data_set_t *data, int *error_flag)
624 625 626 627 628 629 630 631 632 633 634 635 636 637
{
        GSList  *current;
	float_t sum = 0;
	float_t mean, stdev, x;
        float_t num, dem, d;
	float_t n = data->n;

	if (n < 4) {
	        *error_flag = 1;
		return 0;
	} else
	        *error_flag = 0;

	mean = data->sum / n;
Arturo Espinosa's avatar
Arturo Espinosa committed
638
	stdev = sqrt ( (data->sqrsum - data->sum2/n) / (n - 1));
639 640 641

	current = data->array;
	while (current != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
642
	        x = * ( (float_t *) current->data);
643 644 645 646 647 648 649
	        x = (x - mean) / stdev;
		sum += (x * x) * (x * x);
	        current = current->next;
	}

	num = n * (n + 1);
	dem = (n - 1) * (n - 2) * (n - 3);
Arturo Espinosa's avatar
Arturo Espinosa committed
650
	d = (3 * (n - 1) * (n - 1)) / ( (n - 2) * (n - 3));
651 652 653 654 655

	return sum * (num / dem) - d;
}

static float_t
Arturo Espinosa's avatar
Arturo Espinosa committed
656
skew (data_set_t *data, int *error_flag)
657 658 659 660 661 662 663 664 665 666 667 668 669
{
        GSList  *current;
	float_t x3, m, s, x, dxn;
	float_t n = data->n;

	if (n < 3) {
	        *error_flag = 1;
		return 0;
	} else
	        *error_flag = 0;

	x3 = 0;
	m = data->sum / n;
Arturo Espinosa's avatar
Arturo Espinosa committed
670
	s = sqrt ( (data->sqrsum - data->sum2/n) / (n - 1));
671 672 673

	current = data->array;
	while (current != NULL) {
Arturo Espinosa's avatar
Arturo Espinosa committed
674
	        x = * ( (float_t *) current->data);
675 676 677 678 679
		dxn = (x - m) / s;
		x3 += dxn * dxn * dxn;
	        current = current->next;
	}

Arturo Espinosa's avatar
Arturo Espinosa committed
680
	return ( (x3 * n) / (n - 1)) / (n - 2);
681 682 683
}

static void
Arturo Espinosa's avatar
Arturo Espinosa committed
684
summary_statistics (Workbook *wb, data_set_t *data_set, int vars,
685 686 687 688 689 690
		   data_analysis_output_t *dao)
{
        char    buf[256];
	float_t x;
	int     col, error;

691
	prepare_output (wb, dao, _("Summary Statistics"));
692

693
        set_cell (dao, 0, 0, " ");
694
	for (col=0; col<vars; col++) {
695
	        sprintf (buf, _("Column %d"), col+1);
696 697 698
		set_cell (dao, col+1, 0, buf);
	}

699 700 701 702 703 704 705 706 707 708 709 710 711
        set_cell (dao, 0, 1, _("Mean"));
        set_cell (dao, 0, 2, _("Standard Error"));
        set_cell (dao, 0, 3, _("Median"));
        set_cell (dao, 0, 4, _("Mode"));
        set_cell (dao, 0, 5, _("Standard Deviation"));
        set_cell (dao, 0, 6, _("Sample Variance"));
        set_cell (dao, 0, 7, _("Kurtosis"));
        set_cell (dao, 0, 8, _("Skewness"));
        set_cell (dao, 0, 9, _("Range"));
        set_cell (dao, 0, 10, _("Minimum"));
        set_cell (dao, 0, 11, _("Maximum"));
        set_cell (dao, 0, 12, _("Sum"));
        set_cell (dao, 0, 13, _("Count"));
712 713

	for (col=0; col<vars; col++) {
714 715 716
	        float_t var, stdev;

		var = (data_set[col].sqrsum - 
717 718
			       data_set[col].sum2/data_set[col].n) /
		        (data_set[col].n - 1);
Arturo Espinosa's avatar
Arturo Espinosa committed
719
		stdev = sqrt (var);
720 721

		data_set[col].array = 
Arturo Espinosa's avatar
Arturo Espinosa committed
722
		        g_slist_sort (data_set[col].array, 
723 724 725
				     (GCompareFunc) float_compare);

	        /* Mean */
Arturo Espinosa's avatar
Arturo Espinosa committed
726
	        sprintf (buf, "%f", data_set[col].sum / data_set[col].n);
727 728 729
		set_cell (dao, col+1, 1, buf);

		/* Standard Error */
Arturo Espinosa's avatar
Arturo Espinosa committed
730
	        sprintf (buf, "%f", stdev / sqrt (data_set[col].n));
731 732 733 734
		set_cell (dao, col+1, 2, buf);

		/* Median */
		if (data_set[col].n % 2 == 1)
Arturo Espinosa's avatar
Arturo Espinosa committed
735
		        x = * ( (float_t *)g_slist_nth_data (data_set[col].array, 
736 737
							  data_set[col].n/2));
		else {
Arturo Espinosa's avatar
Arturo Espinosa committed
738
		        x=* ( (float_t *)g_slist_nth_data (data_set[col].array, 
739
							data_set[col].n/2));
Arturo Espinosa's avatar
Arturo Espinosa committed
740
		        x+=* ( (float_t *)g_slist_nth_data (data_set[col].array, 
741 742 743
							 data_set[col].n/2-1));
			x /= 2;
		}
Arturo Espinosa's avatar
Arturo Espinosa committed
744
	        sprintf (buf, "%f", x);
745 746 747 748 749 750
		set_cell (dao, col+1, 3, buf);

		/* Mode */
		/* TODO */

		/* Standard Deviation */
Arturo Espinosa's avatar
Arturo Espinosa committed
751
	        sprintf (buf, "%f", stdev);
752 753 754
		set_cell (dao, col+1, 5, buf);

		/* Sample Variance */
Arturo Espinosa's avatar
Arturo Espinosa committed
755
	        sprintf (buf, "%f", var);
756 757 758
		set_cell (dao, col+1, 6, buf);

		/* Kurtosis */
759
		if (data_set[col].n > 3) {
Arturo Espinosa's avatar
Arturo Espinosa committed
760 761
		        x = kurt (&data_set[col], &error);
			sprintf (buf, "%f", x);
762
		} else
763
		        sprintf (buf, _("#N/A"));
764 765 766
		set_cell (dao, col+1, 7, buf);

		/* Skewness */
767
		if (data_set[col].n > 2) {
Arturo Espinosa's avatar
Arturo Espinosa committed
768 769
		        x = skew (&data_set[col], &error);
			sprintf (buf, "%f", x);
770
		} else
771
		        sprintf (buf, _("#N/A"));
772 773 774
		set_cell (dao, col+1, 8, buf);

		/* Range */
Arturo Espinosa's avatar
Arturo Espinosa committed
775
	        sprintf (buf, "%f", data_set[col].max - data_set[col].min);
776 777 778
		set_cell (dao, col+1, 9, buf);

		/* Minimum */
Arturo Espinosa's avatar
Arturo Espinosa committed
779
	        sprintf (buf, "%f", data_set[col].min);
780 781 782
		set_cell (dao, col+1, 10, buf);

		/* Maximum */
Arturo Espinosa's avatar
Arturo Espinosa committed
783
	        sprintf (buf, "%f", data_set[col].max);
784 785 786
		set_cell (dao, col+1, 11, buf);

		/* Sum */
Arturo Espinosa's avatar
Arturo Espinosa committed
787
	        sprintf (buf, "%f", data_set[col].sum);
788 789 790
		set_cell (dao, col+1, 12, buf);

		/* Count */
Arturo Espinosa's avatar
Arturo Espinosa committed
791
	        sprintf (buf, "%d", data_set[col].n);
792 793 794 795 796
		set_cell (dao, col+1, 13, buf);
	}
}

static void
Arturo Espinosa's avatar
Arturo Espinosa committed
797
confidence_level (Workbook *wb, data_set_t *data_set, int vars, float_t c_level,
798 799
		 data_analysis_output_t *dao)
{
800
        float_t x;
801 802 803
        char    buf[256];
        int col;

804
	prepare_output (wb, dao, _("Confidence Level"));
805 806

	for (col=0; col<vars; col++) {
Arturo Espinosa's avatar
Arturo Espinosa committed
807
	        float_t stdev = sqrt ( (data_set[col].sqrsum - 
808 809 810
				      data_set[col].sum2/data_set[col].n) /
				     (data_set[col].n - 1));

811
	        sprintf (buf, _("Column %d"), col+1);
812
		set_cell (dao, col+1, 0, buf);
813

Arturo Espinosa's avatar
Arturo Espinosa committed
814 815
		x = -qnorm (c_level / 2, 0, 1) * (stdev/sqrt (data_set[col].n));
		sprintf (buf, "%f", x);
816
		set_cell (dao, col+1, 2, buf);
817
	}
818
	sprintf (buf, _("Confidence Level (%g%%)"), c_level*100);
819
        set_cell (dao, 0, 2, buf);
820 821 822
}

static void
Arturo Espinosa's avatar
Arturo Espinosa committed
823
kth_largest (Workbook *wb, data_set_t *data_set, int vars, int k,
824 825 826 827 828 829
	    data_analysis_output_t *dao)
{
        float_t x;
        char    buf[256];
        int     col;

830
	prepare_output (wb, dao, _("Kth Largest"));
831 832

	for (col=0; col<vars; col++) {
833
	        sprintf (buf, _("Column %d"), col+1);
834 835 836
		set_cell (dao, col+1, 0, buf);

		data_set[col].array = 
Arturo Espinosa's avatar
Arturo Espinosa committed
837
		        g_slist_sort (data_set[col].array, 
838 839
				     (GCompareFunc) float_compare_desc);

Arturo Espinosa's avatar
Arturo Espinosa committed
840 841 842
		x = * ( (float_t *) g_slist_nth_data (data_set[col].array, k-1));
		sprintf (buf, "%f", x);
		set_cell (dao, col+1, 2, buf);
843
	}
844
	sprintf (buf, _("Largest (%d)"), k);
845 846 847 848
        set_cell (dao, 0, 2, buf);
}

static void
Arturo Espinosa's avatar
Arturo Espinosa committed
849
kth_smallest (Workbook *wb, data_set_t *data_set, int vars, int k,
850 851 852 853 854 855
	     data_analysis_output_t *dao)
{
        float_t x;
        char    buf[256];
        int     col;

856
	prepare_output (wb, dao, _("Kth Smallest"));
857 858

	for (col=0; col<vars; col++) {
859
	        sprintf (buf, _("Column %d"), col+1);
860 861 862
		set_cell (dao, col+1, 0, buf);

		data_set[col].array = 
Arturo Espinosa's avatar
Arturo Espinosa committed
863
		        g_slist_sort (data_set[col].array, 
864 865
				     (GCompareFunc) float_compare);

Arturo Espinosa's avatar
Arturo Espinosa committed
866 867 868
		x = * ( (float_t *) g_slist_nth_data (data_set[col].array, k-1));
		sprintf (buf, "%f", x);
		set_cell (dao, col+1, 2, buf);
869
	}
870
	sprintf (buf, _("Smallest (%d)"), k);
871 872 873 874 875
        set_cell (dao, 0, 2, buf);
}

/* Descriptive Statistics
 */
876
int
877 878 879 880 881 882
descriptive_stat_tool (Workbook *wb, Sheet *current_sheet, 
                       Range *input_range, int columns_flag,
		       descriptive_stat_tool_t *ds,
		       data_analysis_output_t *dao)
{
        data_set_t *data_sets;
883
        int        vars, cols, rows, i;
884

885 886
	cols = input_range->end.col - input_range->start.col + 1;
	rows = input_range->end.row - input_range->start.row + 1;
887 888 889

	if (columns_flag) {
	        vars = cols;
Arturo Espinosa's avatar
Arturo Espinosa committed
890
		data_sets = g_new (data_set_t, vars);
891
		for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
892
		        get_data_groupped_by_columns (current_sheet,
893 894 895 896
						     input_range, i, 
						     &data_sets[i]);
	} else {
	        vars = rows;
Arturo Espinosa's avatar
Arturo Espinosa committed
897
		data_sets = g_new (data_set_t, vars);
898
		for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
899
		        get_data_groupped_by_rows (current_sheet,
900 901 902 903
						  input_range, i, 
						  &data_sets[i]);
	}

904
        if (ds->summary_statistics) {
Arturo Espinosa's avatar
Arturo Espinosa committed
905
                summary_statistics (wb, data_sets, vars, dao);
906 907 908 909
		if (dao->type == RangeOutput)
		        dao->start_row += 15;
	}
        if (ds->confidence_level) {
Arturo Espinosa's avatar
Arturo Espinosa committed
910
                confidence_level (wb, data_sets, vars, ds->c_level, dao);
911 912 913 914
		if (dao->type == RangeOutput)
		        dao->start_row += 4;
	}
        if (ds->kth_largest) {
Arturo Espinosa's avatar
Arturo Espinosa committed
915
                kth_largest (wb, data_sets, vars, ds->k_largest, dao);
916 917 918
		if (dao->type == RangeOutput)
		        dao->start_row += 4;
	}
919
        if (ds->kth_smallest)
Arturo Espinosa's avatar
Arturo Espinosa committed
920
                kth_smallest (wb, data_sets, vars, ds->k_smallest, dao);
921 922

	for (i=0; i<vars; i++)
Arturo Espinosa's avatar
Arturo Espinosa committed
923
	        free_data_set (&data_sets[i]);
924
	g_free (data_sets);
925 926

	return 0;
927
}
928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954



/************* Sampling Tool *********************************************
 *
 * Sampling tool takes a sample from a given data set.  Sample can be
 * a random sample where a given number of data points are selected
 * randomly from the data set.  The sample can also be a periodic
 * sample where, for example, every fourth data element is selected to
 * the sample.  The results are given in a table which can be printed
 * out in a new sheet, in a new workbook, or simply into an existing
 * sheet.
 *
 **/


/* Returns 1 if error occured, for example random sample size is
 * larger than the data set.
 **/
int sampling_tool (Workbook *wb, Sheet *sheet, Range *input_range,
		   gboolean periodic_flag, int size,
		   data_analysis_output_t *dao)
{
        data_set_t data_set;
	char       buf[256];
	float_t    x;

955
	prepare_output (wb, dao, _("Sample"));
956

Arturo Espinosa's avatar
Arturo Espinosa committed
957
	get_data (sheet, input_range, &data_set);
958 959 960 961 962 963 964 965

	if (periodic_flag) {
	        GSList *current = data_set.array;
		int    counter = size-1;
		int    row = 0;

		while (current != NULL) {
		        if (++counter == size) {
Arturo Espinosa's avatar
Arturo Espinosa committed
966 967
			        x = * ( (float_t *) current->data);
				sprintf (buf, "%f", x);
968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983
				set_cell (dao, 0, row++, buf);
				counter = 0;
			}
		        current = current->next;
		}
	} else {
	        int     *index_tbl;
		int     i, n, x;
		GSList  *current = data_set.array;
		int     counter = 0;
		int     row=0;

	        if (size > data_set.n)
		        return 1;

		if (size <= data_set.n/2) {
Arturo Espinosa's avatar
Arturo Espinosa committed
984
		        index_tbl = g_new (int, size);
985 986 987

			for (i=0; i<size; i++) {
			try_again_1:
Arturo Espinosa's avatar
Arturo Espinosa committed
988
			        x = random_01 () * size;
989 990 991 992 993
				for (n=0; n<i; n++)
				        if (index_tbl[n] == x)
					        goto try_again_1;
				index_tbl[i] = x;
			}
Arturo Espinosa's avatar
Arturo Espinosa committed
994
			qsort (index_tbl, size, sizeof (int), 
995 996
			      int_compare);
			n = 0;
997
			while (n < size) {
998
			        if (counter++ == index_tbl[n]) {
Arturo Espinosa's avatar
Arturo Espinosa committed
999 1000
					sprintf (buf, "%f", 
						* ( (float_t *) current->data));
1001 1002 1003 1004 1005
					set_cell (dao, 0, row++, buf);
					++n;
				}
				current = current->next;
			}
Arturo Espinosa's avatar
Arturo Espinosa committed
1006
			g_free (index_tbl);
1007 1008
		} else {
		        if (data_set.n != size)
Arturo Espinosa's avatar
Arturo Espinosa committed
1009
			        index_tbl = g_new (int, data_set.n-size);
1010
			else {
Arturo Espinosa's avatar
Arturo Espinosa committed
1011
			        index_tbl = g_new (int, 1);
1012 1013 1014 1015 1016
				index_tbl[0] = -1;
			}

			for (i=0; i<data_set.n-size; i++) {
			try_again_2:
Arturo Espinosa's avatar
Arturo Espinosa committed
1017
			        x = random_01 () * size;
1018 1019 1020 1021 1022 1023
				for (n=0; n<i; n++)
				        if (index_tbl[n] == x)
					        goto try_again_2;
				index_tbl[i] = x;
			}
			if (data_set.n != size)
Arturo Espinosa's avatar
Arturo Espinosa committed
1024
			        qsort (index_tbl, size, sizeof (int), 
1025 1026 1027 1028 1029 1030
				      int_compare);
			n = 0;
			while (current != NULL) {
			        if (counter++ == index_tbl[n])
					++n;
				else {
Arturo Espinosa's avatar
Arturo Espinosa committed
1031 1032
					sprintf (buf, "%f", 
						* ( (float_t *) current->data));
1033 1034 1035 1036
					set_cell (dao, 0, row++, buf);
				}
				current = current->next;
			}
Arturo Espinosa's avatar
Arturo Espinosa committed
1037
			g_free (index_tbl);
1038 1039 1040 1041 1042 1043 1044 1045
		}
		
	}

	free_data_set (&data_set);

	return 0;
}
1046 1047 1048