source: trunk/Clp/src/CoinAbcHelperFunctions.cpp @ 1878

Last change on this file since 1878 was 1878, checked in by forrest, 7 years ago

minor changes to implement Aboca

File size: 64.9 KB
Line 
1/* $Id: CoinAbcHelperFunctions.cpp 1817 2011-11-03 09:26:23Z forrest $ */
2// Copyright (C) 2003, International Business Machines
3// Corporation and others, Copyright (C) 2012, FasterCoin.  All Rights Reserved.
4// This code is licensed under the terms of the Eclipse Public License (EPL).
5
6#if CLP_HAS_ABC
7#include <cfloat>
8#include <cstdlib>
9#include <cmath>
10#include "CoinHelperFunctions.hpp"
11#include "CoinAbcHelperFunctions.hpp"
12#include "CoinTypes.hpp"
13#include "CoinFinite.hpp"
14#include "CoinAbcCommon.hpp"
15//#define AVX2 1
16#if AVX2==1
17#define set_const_v2df(bb,b) {double * temp=reinterpret_cast<double*>(&bb);temp[0]=b;temp[1]=b;}       
18#endif
19#if INLINE_SCATTER==0
20void 
21CoinAbcScatterUpdate(int number,CoinFactorizationDouble pivotValue,
22                     const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
23                     const int *  COIN_RESTRICT thisIndex,
24                     CoinFactorizationDouble * COIN_RESTRICT region)
25{
26#if UNROLL_SCATTER==0
27  for (CoinBigIndex j=number-1 ; j >=0; j-- ) {
28    CoinSimplexInt iRow = thisIndex[j];
29    CoinFactorizationDouble regionValue = region[iRow];
30    CoinFactorizationDouble value = thisElement[j];
31    assert (value);
32    region[iRow] = regionValue - value * pivotValue;
33  }
34#elif UNROLL_SCATTER==1
35  if ((number&1)!=0) {
36    number--;
37    CoinSimplexInt iRow = thisIndex[number];
38    CoinFactorizationDouble regionValue = region[iRow];
39    CoinFactorizationDouble value = thisElement[number]; 
40    region[iRow] = regionValue - value * pivotValue;
41  }
42  for (CoinBigIndex j=number-1 ; j >=0; j-=2 ) {
43    CoinSimplexInt iRow0 = thisIndex[j];
44    CoinSimplexInt iRow1 = thisIndex[j-1];
45    CoinFactorizationDouble regionValue0 = region[iRow0];
46    CoinFactorizationDouble regionValue1 = region[iRow1];
47    region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
48    region[iRow1] = regionValue1 - thisElement[j-1] * pivotValue;
49  }
50#elif UNROLL_SCATTER==2
51  if ((number&1)!=0) {
52    number--;
53    CoinSimplexInt iRow = thisIndex[number];
54    CoinFactorizationDouble regionValue = region[iRow];
55    CoinFactorizationDouble value = thisElement[number]; 
56    region[iRow] = regionValue - value * pivotValue;
57  }
58  if ((number&2)!=0) {
59    CoinSimplexInt iRow0 = thisIndex[number-1];
60    CoinFactorizationDouble regionValue0 = region[iRow0];
61    CoinFactorizationDouble value0 = thisElement[number-1]; 
62    CoinSimplexInt iRow1 = thisIndex[number-2];
63    CoinFactorizationDouble regionValue1 = region[iRow1];
64    CoinFactorizationDouble value1 = thisElement[number-2]; 
65    region[iRow0] = regionValue0 - value0 * pivotValue;
66    region[iRow1] = regionValue1 - value1 * pivotValue;
67    number-=2;
68  }
69  for (CoinBigIndex j=number-1 ; j >=0; j-=4 ) {
70    CoinSimplexInt iRow0 = thisIndex[j];
71    CoinSimplexInt iRow1 = thisIndex[j-1];
72    CoinFactorizationDouble regionValue0 = region[iRow0];
73    CoinFactorizationDouble regionValue1 = region[iRow1];
74    region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
75    region[iRow1] = regionValue1 - thisElement[j-1] * pivotValue;
76    CoinSimplexInt iRow2 = thisIndex[j-2];
77    CoinSimplexInt iRow3 = thisIndex[j-3];
78    CoinFactorizationDouble regionValue2 = region[iRow2];
79    CoinFactorizationDouble regionValue3 = region[iRow3];
80    region[iRow2] = regionValue2 - thisElement[j-2] * pivotValue;
81    region[iRow3] = regionValue3 - thisElement[j-3] * pivotValue;
82  }
83#elif UNROLL_SCATTER==3
84  CoinSimplexInt iRow0;
85  CoinSimplexInt iRow1;
86  CoinFactorizationDouble regionValue0;
87  CoinFactorizationDouble regionValue1;
88  switch(static_cast<unsigned int>(number)) {
89  case 0:
90    break;
91  case 1:
92    iRow0 = thisIndex[0];
93    regionValue0 = region[iRow0];
94    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
95    break;
96  case 2:
97    iRow0 = thisIndex[0];
98    iRow1 = thisIndex[1];
99    regionValue0 = region[iRow0];
100    regionValue1 = region[iRow1];
101    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
102    region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
103    break;
104  case 3:
105    iRow0 = thisIndex[0];
106    iRow1 = thisIndex[1];
107    regionValue0 = region[iRow0];
108    regionValue1 = region[iRow1];
109    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
110    region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
111    iRow0 = thisIndex[2];
112    regionValue0 = region[iRow0];
113    region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
114    break;
115  case 4:
116    iRow0 = thisIndex[0];
117    iRow1 = thisIndex[1];
118    regionValue0 = region[iRow0];
119    regionValue1 = region[iRow1];
120    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
121    region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
122    iRow0 = thisIndex[2];
123    iRow1 = thisIndex[3];
124    regionValue0 = region[iRow0];
125    regionValue1 = region[iRow1];
126    region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
127    region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
128    break;
129  case 5:
130    iRow0 = thisIndex[0];
131    iRow1 = thisIndex[1];
132    regionValue0 = region[iRow0];
133    regionValue1 = region[iRow1];
134    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
135    region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
136    iRow0 = thisIndex[2];
137    iRow1 = thisIndex[3];
138    regionValue0 = region[iRow0];
139    regionValue1 = region[iRow1];
140    region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
141    region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
142    iRow0 = thisIndex[4];
143    regionValue0 = region[iRow0];
144    region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
145    break;
146  case 6:
147    iRow0 = thisIndex[0];
148    iRow1 = thisIndex[1];
149    regionValue0 = region[iRow0];
150    regionValue1 = region[iRow1];
151    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
152    region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
153    iRow0 = thisIndex[2];
154    iRow1 = thisIndex[3];
155    regionValue0 = region[iRow0];
156    regionValue1 = region[iRow1];
157    region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
158    region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
159    iRow0 = thisIndex[4];
160    iRow1 = thisIndex[5];
161    regionValue0 = region[iRow0];
162    regionValue1 = region[iRow1];
163    region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
164    region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
165    break;
166  case 7:
167    iRow0 = thisIndex[0];
168    iRow1 = thisIndex[1];
169    regionValue0 = region[iRow0];
170    regionValue1 = region[iRow1];
171    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
172    region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
173    iRow0 = thisIndex[2];
174    iRow1 = thisIndex[3];
175    regionValue0 = region[iRow0];
176    regionValue1 = region[iRow1];
177    region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
178    region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
179    iRow0 = thisIndex[4];
180    iRow1 = thisIndex[5];
181    regionValue0 = region[iRow0];
182    regionValue1 = region[iRow1];
183    region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
184    region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
185    iRow0 = thisIndex[6];
186    regionValue0 = region[iRow0];
187    region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
188    break;
189  case 8:
190    iRow0 = thisIndex[0];
191    iRow1 = thisIndex[1];
192    regionValue0 = region[iRow0];
193    regionValue1 = region[iRow1];
194    region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
195    region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
196    iRow0 = thisIndex[2];
197    iRow1 = thisIndex[3];
198    regionValue0 = region[iRow0];
199    regionValue1 = region[iRow1];
200    region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
201    region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
202    iRow0 = thisIndex[4];
203    iRow1 = thisIndex[5];
204    regionValue0 = region[iRow0];
205    regionValue1 = region[iRow1];
206    region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
207    region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
208    iRow0 = thisIndex[6];
209    iRow1 = thisIndex[7];
210    regionValue0 = region[iRow0];
211    regionValue1 = region[iRow1];
212    region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
213    region[iRow1] = regionValue1 - thisElement[7] * pivotValue;
214    break;
215  default:
216    if ((number&1)!=0) {
217      number--;
218      CoinSimplexInt iRow = thisIndex[number];
219      CoinFactorizationDouble regionValue = region[iRow];
220      CoinFactorizationDouble value = thisElement[number];
221      region[iRow] = regionValue - value * pivotValue;
222    }
223    for (CoinBigIndex j=number-1 ; j >=0; j-=2 ) {
224      CoinSimplexInt iRow0 = thisIndex[j];
225      CoinSimplexInt iRow1 = thisIndex[j-1];
226      CoinFactorizationDouble regionValue0 = region[iRow0];
227      CoinFactorizationDouble regionValue1 = region[iRow1];
228      region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
229      region[iRow1] = regionValue1 - thisElement[j-1] * pivotValue;
230    }
231    break;
232  }
233#endif
234}
235#endif
236#if INLINE_GATHER==0
237CoinFactorizationDouble
238CoinAbcGatherUpdate(CoinSimplexInt number,
239                    const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
240                    const int *  COIN_RESTRICT thisIndex,
241                    CoinFactorizationDouble * COIN_RESTRICT region)
242{
243#if UNROLL_GATHER==0
244  CoinFactorizationDouble pivotValue=0.0;
245  for (CoinBigIndex j = 0; j < number; j ++ ) {
246    CoinFactorizationDouble value = thisElement[j];
247    CoinSimplexInt jRow = thisIndex[j];
248    value *= region[jRow];
249    pivotValue -= value;
250  }
251  return pivotValue;
252#else
253#error code
254#endif
255}
256#endif
257#if INLINE_MULTIPLY_ADD==0
258void 
259CoinAbcMultiplyIndexed(int number,
260                       const CoinFactorizationDouble *  COIN_RESTRICT multiplier,
261                       const int *  COIN_RESTRICT thisIndex,
262                       CoinSimplexDouble * COIN_RESTRICT region)
263{
264#ifndef INTEL_COMPILER
265#pragma simd
266#endif
267#if UNROLL_MULTIPLY_ADD==0
268  for (CoinSimplexInt i = 0; i < number; i ++ ) {
269    CoinSimplexInt iRow = thisIndex[i];
270    CoinSimplexDouble value = region[iRow];
271    value *= multiplier[iRow];
272    region[iRow] = value;
273  }
274#else
275#error code
276#endif
277}
278void 
279CoinAbcMultiplyIndexed(int number,
280                       const long double *  COIN_RESTRICT multiplier,
281                       const int *  COIN_RESTRICT thisIndex,
282                       long double * COIN_RESTRICT region)
283{
284#ifndef INTEL_COMPILER
285#pragma simd
286#endif
287#if UNROLL_MULTIPLY_ADD==0
288  for (CoinSimplexInt i = 0; i < number; i ++ ) {
289    CoinSimplexInt iRow = thisIndex[i];
290    long double value = region[iRow];
291    value *= multiplier[iRow];
292    region[iRow] = value;
293  }
294#else
295#error code
296#endif
297}
298#endif
299double
300CoinAbcMaximumAbsElement(const double * region, int sizeIn)
301{
302  int size=sizeIn;
303     double maxValue = 0.0;
304     //printf("a\n");
305#ifndef INTEL_COMPILER
306#pragma simd
307#endif
308     /*cilk_*/ for (int i = 0; i < size; i++)
309          maxValue = CoinMax(maxValue, fabs(region[i]));
310     return maxValue;
311}
312void 
313CoinAbcMinMaxAbsElement(const double * region, int sizeIn,double & minimum , double & maximum)
314{
315  double minValue=minimum;
316  double maxValue=maximum;
317  int size=sizeIn;
318  //printf("b\n");
319#ifndef INTEL_COMPILER
320#pragma simd
321#endif
322     /*cilk_*/ for (int i=0;i<size;i++) {
323    double value=fabs(region[i]);
324    minValue=CoinMin(value,minValue);
325    maxValue=CoinMax(value,maxValue);
326  }
327  minimum=minValue;
328  maximum=maxValue;
329}
330void 
331CoinAbcMinMaxAbsNormalValues(const double * region, int sizeIn,double & minimum , double & maximum)
332{
333  int size=sizeIn;
334  double minValue=minimum;
335  double maxValue=maximum;
336#define NORMAL_LOW_VALUE 1.0e-8
337#define NORMAL_HIGH_VALUE 1.0e8
338  //printf("c\n");
339#ifndef INTEL_COMPILER
340#pragma simd
341#endif
342     /*cilk_*/ for (int i=0;i<size;i++) {
343    double value=fabs(region[i]);
344    if (value>NORMAL_LOW_VALUE&&value<NORMAL_HIGH_VALUE) {
345      minValue=CoinMin(value,minValue);
346      maxValue=CoinMax(value,maxValue);
347    }
348  }
349  minimum=minValue;
350  maximum=maxValue;
351}
352void 
353CoinAbcScale(double * region, double multiplier,int sizeIn)
354{
355  int size=sizeIn;
356  // used printf("d\n");
357#ifndef INTEL_COMPILER
358#pragma simd
359#endif
360#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
361  cilk_for (int i=0;i<size;i++) {
362    region[i] *= multiplier;
363  }
364}
365void 
366CoinAbcScaleNormalValues(double * region, double multiplier,double killIfLessThanThis,int sizeIn)
367{
368  int size=sizeIn;
369     // used printf("e\n");
370#ifndef INTEL_COMPILER
371#pragma simd
372#endif
373#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
374  cilk_for (int i=0;i<size;i++) {
375    double value=fabs(region[i]);
376    if (value>killIfLessThanThis) {
377      if (value!=COIN_DBL_MAX) {
378        value *= multiplier;
379        if (value>killIfLessThanThis)
380          region[i] *= multiplier;
381        else
382          region[i]=0.0;
383      }
384    } else {
385      region[i]=0.0;
386    }
387  }
388}
389// maximum fabs(region[i]) and then region[i]*=multiplier
390double 
391CoinAbcMaximumAbsElementAndScale(double * region, double multiplier,int sizeIn)
392{
393  double maxValue=0.0;
394  int size=sizeIn;
395  //printf("f\n");
396#ifndef INTEL_COMPILER
397#pragma simd
398#endif
399     /*cilk_*/ for (int i=0;i<size;i++) {
400    double value=fabs(region[i]);
401    region[i] *= multiplier;
402    maxValue=CoinMax(value,maxValue);
403  }
404  return maxValue;
405}
406void
407CoinAbcSetElements(double * region, int sizeIn, double value)
408{
409  int size=sizeIn;
410     printf("g\n");
411#ifndef INTEL_COMPILER
412#pragma simd
413#endif
414#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
415     cilk_for (int i = 0; i < size; i++)
416          region[i] = value;
417}
418void
419CoinAbcMultiplyAdd(const double * region1, int sizeIn, double multiplier1,
420            double * regionChanged, double multiplier2)
421{
422  //printf("h\n");
423  int size=sizeIn;
424     if (multiplier1 == 1.0) {
425          if (multiplier2 == 1.0) {
426#ifndef INTEL_COMPILER
427#pragma simd
428#endif
429               /*cilk_*/ for (int i = 0; i < size; i++)
430                    regionChanged[i] = region1[i] + regionChanged[i];
431          } else if (multiplier2 == -1.0) {
432#ifndef INTEL_COMPILER
433#pragma simd
434#endif
435               /*cilk_*/ for (int i = 0; i < size; i++)
436                    regionChanged[i] = region1[i] - regionChanged[i];
437          } else if (multiplier2 == 0.0) {
438#ifndef INTEL_COMPILER
439#pragma simd
440#endif
441               /*cilk_*/ for (int i = 0; i < size; i++)
442                    regionChanged[i] = region1[i] ;
443          } else {
444#ifndef INTEL_COMPILER
445#pragma simd
446#endif
447               /*cilk_*/ for (int i = 0; i < size; i++)
448                    regionChanged[i] = region1[i] + multiplier2 * regionChanged[i];
449          }
450     } else if (multiplier1 == -1.0) {
451          if (multiplier2 == 1.0) {
452#ifndef INTEL_COMPILER
453#pragma simd
454#endif
455               /*cilk_*/ for (int i = 0; i < size; i++)
456                    regionChanged[i] = -region1[i] + regionChanged[i];
457          } else if (multiplier2 == -1.0) {
458#ifndef INTEL_COMPILER
459#pragma simd
460#endif
461               /*cilk_*/ for (int i = 0; i < size; i++)
462                    regionChanged[i] = -region1[i] - regionChanged[i];
463          } else if (multiplier2 == 0.0) {
464#ifndef INTEL_COMPILER
465#pragma simd
466#endif
467               /*cilk_*/ for (int i = 0; i < size; i++)
468                    regionChanged[i] = -region1[i] ;
469          } else {
470#ifndef INTEL_COMPILER
471#pragma simd
472#endif
473               /*cilk_*/ for (int i = 0; i < size; i++)
474                    regionChanged[i] = -region1[i] + multiplier2 * regionChanged[i];
475          }
476     } else if (multiplier1 == 0.0) {
477          if (multiplier2 == 1.0) {
478               // nothing to do
479          } else if (multiplier2 == -1.0) {
480#ifndef INTEL_COMPILER
481#pragma simd
482#endif
483               /*cilk_*/ for (int i = 0; i < size; i++)
484                    regionChanged[i] =  -regionChanged[i];
485          } else if (multiplier2 == 0.0) {
486#ifndef INTEL_COMPILER
487#pragma simd
488#endif
489               /*cilk_*/ for (int i = 0; i < size; i++)
490                    regionChanged[i] =  0.0;
491          } else {
492#ifndef INTEL_COMPILER
493#pragma simd
494#endif
495               /*cilk_*/ for (int i = 0; i < size; i++)
496                    regionChanged[i] =  multiplier2 * regionChanged[i];
497          }
498     } else {
499          if (multiplier2 == 1.0) {
500#ifndef INTEL_COMPILER
501#pragma simd
502#endif
503               /*cilk_*/ for (int i = 0; i < size; i++)
504                    regionChanged[i] = multiplier1 * region1[i] + regionChanged[i];
505          } else if (multiplier2 == -1.0) {
506#ifndef INTEL_COMPILER
507#pragma simd
508#endif
509               /*cilk_*/ for (int i = 0; i < size; i++)
510                    regionChanged[i] = multiplier1 * region1[i] - regionChanged[i];
511          } else if (multiplier2 == 0.0) {
512#ifndef INTEL_COMPILER
513#pragma simd
514#endif
515               /*cilk_*/ for (int i = 0; i < size; i++)
516                    regionChanged[i] = multiplier1 * region1[i] ;
517          } else {
518#ifndef INTEL_COMPILER
519#pragma simd
520#endif
521               /*cilk_*/ for (int i = 0; i < size; i++)
522                    regionChanged[i] = multiplier1 * region1[i] + multiplier2 * regionChanged[i];
523          }
524     }
525}
526double
527CoinAbcInnerProduct(const double * region1, int sizeIn, const double * region2)
528{
529  int size=sizeIn;
530  //printf("i\n");
531     double value = 0.0;
532#ifndef INTEL_COMPILER
533#pragma simd
534#endif
535     /*cilk_*/ for (int i = 0; i < size; i++)
536          value += region1[i] * region2[i];
537     return value;
538}
539void
540CoinAbcGetNorms(const double * region, int sizeIn, double & norm1, double & norm2)
541{
542  int size=sizeIn;
543  //printf("j\n");
544     norm1 = 0.0;
545     norm2 = 0.0;
546#ifndef INTEL_COMPILER
547#pragma simd
548#endif
549     /*cilk_*/ for (int i = 0; i < size; i++) {
550          norm2 += region[i] * region[i];
551          norm1 = CoinMax(norm1, fabs(region[i]));
552     }
553}
554// regionTo[index[i]]=regionFrom[i]
555void 
556CoinAbcScatterTo(const double * regionFrom, double * regionTo, const int * index,int numberIn)
557{
558  int number=numberIn;
559     // used printf("k\n");
560#ifndef INTEL_COMPILER
561#pragma simd
562#endif
563#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
564  cilk_for (int i=0;i<number;i++) {
565    int k=index[i];
566    regionTo[k]=regionFrom[i];
567  }
568}
569// regionTo[i]=regionFrom[index[i]]
570void 
571CoinAbcGatherFrom(const double * regionFrom, double * regionTo, const int * index,int numberIn)
572{
573  int number=numberIn;
574     // used printf("l\n");
575#ifndef INTEL_COMPILER
576#pragma simd
577#endif
578#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
579  cilk_for (int i=0;i<number;i++) {
580    int k=index[i];
581    regionTo[i]=regionFrom[k];
582  }
583}
584// regionTo[index[i]]=0.0
585void 
586CoinAbcScatterZeroTo(double * regionTo, const int * index,int numberIn)
587{
588  int number=numberIn;
589     // used printf("m\n");
590#ifndef INTEL_COMPILER
591#pragma simd
592#endif
593#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
594  cilk_for (int i=0;i<number;i++) {
595    int k=index[i];
596    regionTo[k]=0.0;
597  }
598}
599// regionTo[indexScatter[indexList[i]]]=regionFrom[indexList[i]]
600void 
601CoinAbcScatterToList(const double * regionFrom, double * regionTo, 
602                   const int * indexList, const int * indexScatter ,int numberIn)
603{
604  int number=numberIn;
605  //printf("n\n");
606#ifndef INTEL_COMPILER
607#pragma simd
608#endif
609     /*cilk_*/ for (int i=0;i<number;i++) {
610    int j=indexList[i];
611    double value=regionFrom[j];
612    int k=indexScatter[j];
613    regionTo[k]=value;
614  }
615}
616void 
617CoinAbcInverseSqrts(double * array, int nIn)
618{
619  int n=nIn;
620     // used printf("o\n");
621#ifndef INTEL_COMPILER
622#pragma simd
623#endif
624#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
625  cilk_for (int i = 0; i < n; i++)
626    array[i] = 1.0 / sqrt(array[i]);
627}
628void 
629CoinAbcReciprocal(double * array, int nIn, const double *input)
630{
631  int n=nIn;
632     // used printf("p\n");
633#ifndef INTEL_COMPILER
634#pragma simd
635#endif
636#pragma cilk_grainsize=CILK_FOR_GRAINSIZE
637  cilk_for (int i = 0; i < n; i++)
638    array[i] = 1.0 / input[i];
639}
640void 
641CoinAbcMemcpyLong(double * array,const double * arrayFrom,int size)
642{
643  memcpy(array,arrayFrom,size*sizeof(double));
644}
645void 
646CoinAbcMemcpyLong(int * array,const int * arrayFrom,int size)
647{
648  memcpy(array,arrayFrom,size*sizeof(int));
649}
650void 
651CoinAbcMemcpyLong(unsigned char * array,const unsigned char * arrayFrom,int size)
652{
653  memcpy(array,arrayFrom,size*sizeof(unsigned char));
654}
655void 
656CoinAbcMemset0Long(double * array,int size)
657{
658  memset(array,0,size*sizeof(double));
659}
660void 
661CoinAbcMemset0Long(int * array,int size)
662{
663  memset(array,0,size*sizeof(int));
664}
665void 
666CoinAbcMemset0Long(unsigned char * array,int size)
667{
668  memset(array,0,size*sizeof(unsigned char));
669}
670void 
671CoinAbcMemmove(double * array,const double * arrayFrom,int size)
672{
673  memmove(array,arrayFrom,size*sizeof(double));
674}
675void CoinAbcMemmove(int * array,const int * arrayFrom,int size)
676{
677  memmove(array,arrayFrom,size*sizeof(int));
678}
679void CoinAbcMemmove(unsigned char * array,const unsigned char * arrayFrom,int size)
680{
681  memmove(array,arrayFrom,size*sizeof(unsigned char));
682}
683// This moves down and zeroes out end
684void CoinAbcMemmoveAndZero(double * array,double * arrayFrom,int size)
685{
686  assert(arrayFrom-array>=0);
687  memmove(array,arrayFrom,size*sizeof(double));
688  double * endArray = array+size;
689  double * endArrayFrom = arrayFrom+size;
690  double * startZero = CoinMax(arrayFrom,endArray);
691  memset(startZero,0,(endArrayFrom-startZero)*sizeof(double));
692}
693// This compacts several sections and zeroes out end
694int CoinAbcCompact(int numberSections,int alreadyDone,double * array,const int * starts, const int * lengths)
695{
696  int totalLength=alreadyDone;
697  for (int i=0;i<numberSections;i++) {
698    totalLength += lengths[i];
699  }
700  for (int i=0;i<numberSections;i++) {
701    int start=starts[i];
702    int length=lengths[i];
703    int end=start+length;
704    memmove(array+alreadyDone,array+start,length*sizeof(double));
705    if (end>totalLength) {
706      start=CoinMax(start,totalLength);
707      memset(array+start,0,(end-start)*sizeof(double));
708    }
709    alreadyDone += length;
710  }
711  return totalLength;
712} 
713// This compacts several sections
714int CoinAbcCompact(int numberSections,int alreadyDone,int * array,const int * starts, const int * lengths)
715{
716  for (int i=0;i<numberSections;i++) {
717    memmove(array+alreadyDone,array+starts[i],lengths[i]*sizeof(int));
718    alreadyDone += lengths[i];
719  }
720  return alreadyDone;
721}
722void CoinAbcScatterUpdate0(int number,CoinFactorizationDouble /*pivotValue*/,
723                           const CoinFactorizationDouble *  COIN_RESTRICT /*thisElement*/,
724                           const int *  COIN_RESTRICT /*thisIndex*/,
725                           CoinFactorizationDouble *  COIN_RESTRICT /*region*/)
726{
727  assert (number==0);
728}
729void CoinAbcScatterUpdate1(int number,CoinFactorizationDouble pivotValue,
730                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
731                          const int *  COIN_RESTRICT thisIndex,
732                          CoinFactorizationDouble * COIN_RESTRICT region)
733{
734  assert (number==1);
735  int iRow0 = thisIndex[0];
736  CoinFactorizationDouble regionValue0 = region[iRow0];
737  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
738}
739void CoinAbcScatterUpdate2(int number,CoinFactorizationDouble pivotValue,
740                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
741                          const int *  COIN_RESTRICT thisIndex,
742                          CoinFactorizationDouble * COIN_RESTRICT region)
743{
744  assert (number==2);
745  int iRow0 = thisIndex[0];
746  int iRow1 = thisIndex[1];
747  CoinFactorizationDouble regionValue0 = region[iRow0];
748  CoinFactorizationDouble regionValue1 = region[iRow1];
749  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
750  region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
751}
752void CoinAbcScatterUpdate3(int number,CoinFactorizationDouble pivotValue,
753                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
754                          const int *  COIN_RESTRICT thisIndex,
755                          CoinFactorizationDouble * COIN_RESTRICT region)
756{
757  assert (number==3);
758  int iRow0 = thisIndex[0];
759  int iRow1 = thisIndex[1];
760  CoinFactorizationDouble regionValue0 = region[iRow0];
761  CoinFactorizationDouble regionValue1 = region[iRow1];
762  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
763  region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
764  iRow0 = thisIndex[2];
765  regionValue0 = region[iRow0];
766  region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
767}
768void CoinAbcScatterUpdate4(int number,CoinFactorizationDouble pivotValue,
769                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
770                          const int *  COIN_RESTRICT thisIndex,
771                          CoinFactorizationDouble * COIN_RESTRICT region)
772{
773  assert (number==4);
774  int iRow0 = thisIndex[0];
775  int iRow1 = thisIndex[1];
776  CoinFactorizationDouble regionValue0 = region[iRow0];
777  CoinFactorizationDouble regionValue1 = region[iRow1];
778  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
779  region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
780  iRow0 = thisIndex[2];
781  iRow1 = thisIndex[3];
782  regionValue0 = region[iRow0];
783  regionValue1 = region[iRow1];
784  region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
785  region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
786}
787void CoinAbcScatterUpdate5(int number,CoinFactorizationDouble pivotValue,
788                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
789                          const int *  COIN_RESTRICT thisIndex,
790                          CoinFactorizationDouble * COIN_RESTRICT region)
791{
792  assert (number==5);
793  int iRow0 = thisIndex[0];
794  int iRow1 = thisIndex[1];
795  CoinFactorizationDouble regionValue0 = region[iRow0];
796  CoinFactorizationDouble regionValue1 = region[iRow1];
797  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
798  region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
799  iRow0 = thisIndex[2];
800  iRow1 = thisIndex[3];
801  regionValue0 = region[iRow0];
802  regionValue1 = region[iRow1];
803  region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
804  region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
805  iRow0 = thisIndex[4];
806  regionValue0 = region[iRow0];
807  region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
808}
809void CoinAbcScatterUpdate6(int number,CoinFactorizationDouble pivotValue,
810                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
811                          const int *  COIN_RESTRICT thisIndex,
812                          CoinFactorizationDouble * COIN_RESTRICT region)
813{
814  assert (number==6);
815  int iRow0 = thisIndex[0];
816  int iRow1 = thisIndex[1];
817  CoinFactorizationDouble regionValue0 = region[iRow0];
818  CoinFactorizationDouble regionValue1 = region[iRow1];
819  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
820  region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
821  iRow0 = thisIndex[2];
822  iRow1 = thisIndex[3];
823  regionValue0 = region[iRow0];
824  regionValue1 = region[iRow1];
825  region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
826  region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
827  iRow0 = thisIndex[4];
828  iRow1 = thisIndex[5];
829  regionValue0 = region[iRow0];
830  regionValue1 = region[iRow1];
831  region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
832  region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
833}
834void CoinAbcScatterUpdate7(int number,CoinFactorizationDouble pivotValue,
835                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
836                          const int *  COIN_RESTRICT thisIndex,
837                          CoinFactorizationDouble * COIN_RESTRICT region)
838{
839  assert (number==7);
840  int iRow0 = thisIndex[0];
841  int iRow1 = thisIndex[1];
842  CoinFactorizationDouble regionValue0 = region[iRow0];
843  CoinFactorizationDouble regionValue1 = region[iRow1];
844  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
845  region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
846  iRow0 = thisIndex[2];
847  iRow1 = thisIndex[3];
848  regionValue0 = region[iRow0];
849  regionValue1 = region[iRow1];
850  region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
851  region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
852  iRow0 = thisIndex[4];
853  iRow1 = thisIndex[5];
854  regionValue0 = region[iRow0];
855  regionValue1 = region[iRow1];
856  region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
857  region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
858  iRow0 = thisIndex[6];
859  regionValue0 = region[iRow0];
860  region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
861}
862void CoinAbcScatterUpdate8(int number,CoinFactorizationDouble pivotValue,
863                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
864                          const int *  COIN_RESTRICT thisIndex,
865                          CoinFactorizationDouble * COIN_RESTRICT region)
866{
867  assert (number==8);
868  int iRow0 = thisIndex[0];
869  int iRow1 = thisIndex[1];
870  CoinFactorizationDouble regionValue0 = region[iRow0];
871  CoinFactorizationDouble regionValue1 = region[iRow1];
872  region[iRow0] = regionValue0 - thisElement[0] * pivotValue;
873  region[iRow1] = regionValue1 - thisElement[1] * pivotValue;
874  iRow0 = thisIndex[2];
875  iRow1 = thisIndex[3];
876  regionValue0 = region[iRow0];
877  regionValue1 = region[iRow1];
878  region[iRow0] = regionValue0 - thisElement[2] * pivotValue;
879  region[iRow1] = regionValue1 - thisElement[3] * pivotValue;
880  iRow0 = thisIndex[4];
881  iRow1 = thisIndex[5];
882  regionValue0 = region[iRow0];
883  regionValue1 = region[iRow1];
884  region[iRow0] = regionValue0 - thisElement[4] * pivotValue;
885  region[iRow1] = regionValue1 - thisElement[5] * pivotValue;
886  iRow0 = thisIndex[6];
887  iRow1 = thisIndex[7];
888  regionValue0 = region[iRow0];
889  regionValue1 = region[iRow1];
890  region[iRow0] = regionValue0 - thisElement[6] * pivotValue;
891  region[iRow1] = regionValue1 - thisElement[7] * pivotValue;
892}
893void CoinAbcScatterUpdate4N(int number,CoinFactorizationDouble pivotValue,
894                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
895                          const int *  COIN_RESTRICT thisIndex,
896                          CoinFactorizationDouble * COIN_RESTRICT region)
897{
898  assert ((number&3)==0);
899  for (CoinBigIndex j=number-1 ; j >=0; j-=4 ) {
900    CoinSimplexInt iRow0 = thisIndex[j];
901    CoinSimplexInt iRow1 = thisIndex[j-1];
902    CoinFactorizationDouble regionValue0 = region[iRow0];
903    CoinFactorizationDouble regionValue1 = region[iRow1];
904    region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
905    region[iRow1] = regionValue1 - thisElement[j-1] * pivotValue;
906    iRow0 = thisIndex[j-2];
907    iRow1 = thisIndex[j-3];
908    regionValue0 = region[iRow0];
909    regionValue1 = region[iRow1];
910    region[iRow0] = regionValue0 - thisElement[j-2] * pivotValue;
911    region[iRow1] = regionValue1 - thisElement[j-3] * pivotValue;
912  }
913}
914void CoinAbcScatterUpdate4NPlus1(int number,CoinFactorizationDouble pivotValue,
915                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
916                          const int *  COIN_RESTRICT thisIndex,
917                          CoinFactorizationDouble * COIN_RESTRICT region)
918{
919  assert ((number&3)==1);
920  int iRow0 = thisIndex[number-1];
921  CoinFactorizationDouble regionValue0 = region[iRow0];
922  region[iRow0] = regionValue0 - thisElement[number-1] * pivotValue;
923  number -= 1;
924  for (CoinBigIndex j=number-1 ; j >=0; j-=4 ) {
925    CoinSimplexInt iRow0 = thisIndex[j];
926    CoinSimplexInt iRow1 = thisIndex[j-1];
927    CoinFactorizationDouble regionValue0 = region[iRow0];
928    CoinFactorizationDouble regionValue1 = region[iRow1];
929    region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
930    region[iRow1] = regionValue1 - thisElement[j-1] * pivotValue;
931    iRow0 = thisIndex[j-2];
932    iRow1 = thisIndex[j-3];
933    regionValue0 = region[iRow0];
934    regionValue1 = region[iRow1];
935    region[iRow0] = regionValue0 - thisElement[j-2] * pivotValue;
936    region[iRow1] = regionValue1 - thisElement[j-3] * pivotValue;
937  }
938}
939void CoinAbcScatterUpdate4NPlus2(int number,CoinFactorizationDouble pivotValue,
940                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
941                          const int *  COIN_RESTRICT thisIndex,
942                          CoinFactorizationDouble * COIN_RESTRICT region)
943{
944  assert ((number&3)==2);
945  int iRow0 = thisIndex[number-1];
946  int iRow1 = thisIndex[number-2];
947  CoinFactorizationDouble regionValue0 = region[iRow0];
948  CoinFactorizationDouble regionValue1 = region[iRow1];
949  region[iRow0] = regionValue0 - thisElement[number-1] * pivotValue;
950  region[iRow1] = regionValue1 - thisElement[number-2] * pivotValue;
951  number -= 2;
952  for (CoinBigIndex j=number-1 ; j >=0; j-=4 ) {
953    CoinSimplexInt iRow0 = thisIndex[j];
954    CoinSimplexInt iRow1 = thisIndex[j-1];
955    CoinFactorizationDouble regionValue0 = region[iRow0];
956    CoinFactorizationDouble regionValue1 = region[iRow1];
957    region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
958    region[iRow1] = regionValue1 - thisElement[j-1] * pivotValue;
959    iRow0 = thisIndex[j-2];
960    iRow1 = thisIndex[j-3];
961    regionValue0 = region[iRow0];
962    regionValue1 = region[iRow1];
963    region[iRow0] = regionValue0 - thisElement[j-2] * pivotValue;
964    region[iRow1] = regionValue1 - thisElement[j-3] * pivotValue;
965  }
966}
967void CoinAbcScatterUpdate4NPlus3(int number,CoinFactorizationDouble pivotValue,
968                          const CoinFactorizationDouble *  COIN_RESTRICT thisElement,
969                          const int *  COIN_RESTRICT thisIndex,
970                          CoinFactorizationDouble * COIN_RESTRICT region)
971{
972  assert ((number&3)==3);
973  int iRow0 = thisIndex[number-1];
974  int iRow1 = thisIndex[number-2];
975  CoinFactorizationDouble regionValue0 = region[iRow0];
976  CoinFactorizationDouble regionValue1 = region[iRow1];
977  region[iRow0] = regionValue0 - thisElement[number-1] * pivotValue;
978  region[iRow1] = regionValue1 - thisElement[number-2] * pivotValue;
979  iRow0 = thisIndex[number-3];
980  regionValue0 = region[iRow0];
981  region[iRow0] = regionValue0 - thisElement[number-3] * pivotValue;
982  number -= 3;
983  for (CoinBigIndex j=number-1 ; j >=0; j-=4 ) {
984    CoinSimplexInt iRow0 = thisIndex[j];
985    CoinSimplexInt iRow1 = thisIndex[j-1];
986    CoinFactorizationDouble regionValue0 = region[iRow0];
987    CoinFactorizationDouble regionValue1 = region[iRow1];
988    region[iRow0] = regionValue0 - thisElement[j] * pivotValue;
989    region[iRow1] = regionValue1 - thisElement[j-1] * pivotValue;
990    iRow0 = thisIndex[j-2];
991    iRow1 = thisIndex[j-3];
992    regionValue0 = region[iRow0];
993    regionValue1 = region[iRow1];
994    region[iRow0] = regionValue0 - thisElement[j-2] * pivotValue;
995    region[iRow1] = regionValue1 - thisElement[j-3] * pivotValue;
996  }
997}
998void CoinAbcScatterUpdate0(int numberIn, CoinFactorizationDouble /*multiplier*/,
999                           const CoinFactorizationDouble *  COIN_RESTRICT element,
1000                           CoinFactorizationDouble *  COIN_RESTRICT /*region*/)
1001{
1002#ifndef NDEBUG
1003  assert (numberIn==0);
1004#endif
1005}
1006// start alternatives
1007#if 0
1008void CoinAbcScatterUpdate1(int numberIn, CoinFactorizationDouble multiplier,
1009                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1010                          CoinFactorizationDouble * COIN_RESTRICT region)
1011{
1012#ifndef NDEBUG
1013  assert (numberIn==1);
1014#endif
1015  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1016  element++;
1017  int iColumn0=thisColumn[0];
1018  CoinFactorizationDouble value0=region[iColumn0];
1019  value0 += multiplier*element[0];
1020  region[iColumn0]=value0;
1021}
1022void CoinAbcScatterUpdate2(int numberIn, CoinFactorizationDouble multiplier,
1023                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1024                          CoinFactorizationDouble * COIN_RESTRICT region)
1025{
1026#ifndef NDEBUG
1027  assert (numberIn==2);
1028#endif
1029  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1030#if NEW_CHUNK_SIZE==2
1031  int nFull=2&(~(NEW_CHUNK_SIZE-1));
1032  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1033    coin_prefetch(element+NEW_CHUNK_SIZE_INCREMENT);
1034    int iColumn0=thisColumn[0];
1035    int iColumn1=thisColumn[1];
1036    CoinFactorizationDouble value0=region[iColumn0];
1037    CoinFactorizationDouble value1=region[iColumn1];
1038    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1039    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1040    region[iColumn0]=value0;
1041    region[iColumn1]=value1;
1042    element+=NEW_CHUNK_SIZE_INCREMENT;
1043    thisColumn = reinterpret_cast<const int *>(element);
1044  }
1045#endif
1046#if NEW_CHUNK_SIZE==4
1047  element++;
1048  int iColumn0=thisColumn[0];
1049  int iColumn1=thisColumn[1];
1050  CoinFactorizationDouble value0=region[iColumn0];
1051  CoinFactorizationDouble value1=region[iColumn1];
1052  value0 += multiplier*element[0];
1053  value1 += multiplier*element[1];
1054  region[iColumn0]=value0;
1055  region[iColumn1]=value1;
1056#endif
1057}
1058void CoinAbcScatterUpdate3(int numberIn, CoinFactorizationDouble multiplier,
1059                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1060                          CoinFactorizationDouble * COIN_RESTRICT region)
1061{
1062#ifndef NDEBUG
1063  assert (numberIn==3);
1064#endif
1065  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1066#if NEW_CHUNK_SIZE==2
1067  int nFull=3&(~(NEW_CHUNK_SIZE-1));
1068  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1069    coin_prefetch(element+NEW_CHUNK_SIZE_INCREMENT);
1070    int iColumn0=thisColumn[0];
1071    int iColumn1=thisColumn[1];
1072    CoinFactorizationDouble value0=region[iColumn0];
1073    CoinFactorizationDouble value1=region[iColumn1];
1074    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1075    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1076    region[iColumn0]=value0;
1077    region[iColumn1]=value1;
1078    element+=NEW_CHUNK_SIZE_INCREMENT;
1079    thisColumn = reinterpret_cast<const int *>(element);
1080  }
1081#endif
1082#if NEW_CHUNK_SIZE==2
1083  element++;
1084  int iColumn0=thisColumn[0];
1085  CoinFactorizationDouble value0=region[iColumn0];
1086  value0 += multiplier*element[0];
1087  region[iColumn0]=value0;
1088#else
1089  element+=2;
1090  int iColumn0=thisColumn[0];
1091  int iColumn1=thisColumn[1];
1092  int iColumn2=thisColumn[2];
1093  CoinFactorizationDouble value0=region[iColumn0];
1094  CoinFactorizationDouble value1=region[iColumn1];
1095  CoinFactorizationDouble value2=region[iColumn2];
1096  value0 += multiplier*element[0];
1097  value1 += multiplier*element[1];
1098  value2 += multiplier*element[2];
1099  region[iColumn0]=value0;
1100  region[iColumn1]=value1;
1101  region[iColumn2]=value2;
1102#endif
1103}
1104void CoinAbcScatterUpdate4(int numberIn, CoinFactorizationDouble multiplier,
1105                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1106                          CoinFactorizationDouble * COIN_RESTRICT region)
1107{
1108#ifndef NDEBUG
1109  assert (numberIn==4);
1110#endif
1111  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1112  int nFull=4&(~(NEW_CHUNK_SIZE-1));
1113  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1114    //coin_prefetch(element+NEW_CHUNK_SIZE_INCREMENT);
1115#if NEW_CHUNK_SIZE==2
1116    int iColumn0=thisColumn[0];
1117    int iColumn1=thisColumn[1];
1118    CoinFactorizationDouble value0=region[iColumn0];
1119    CoinFactorizationDouble value1=region[iColumn1];
1120    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1121    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1122    region[iColumn0]=value0;
1123    region[iColumn1]=value1;
1124#elif NEW_CHUNK_SIZE==4
1125    int iColumn0=thisColumn[0];
1126    int iColumn1=thisColumn[1];
1127    int iColumn2=thisColumn[2];
1128    int iColumn3=thisColumn[3];
1129    CoinFactorizationDouble value0=region[iColumn0];
1130    CoinFactorizationDouble value1=region[iColumn1];
1131    CoinFactorizationDouble value2=region[iColumn2];
1132    CoinFactorizationDouble value3=region[iColumn3];
1133    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1134    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1135    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1136    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1137    region[iColumn0]=value0;
1138    region[iColumn1]=value1;
1139    region[iColumn2]=value2;
1140    region[iColumn3]=value3;
1141#else
1142    abort();
1143#endif
1144    element+=NEW_CHUNK_SIZE_INCREMENT;
1145    thisColumn = reinterpret_cast<const int *>(element);
1146  }
1147}
1148void CoinAbcScatterUpdate5(int numberIn, CoinFactorizationDouble multiplier,
1149                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1150                          CoinFactorizationDouble * COIN_RESTRICT region)
1151{
1152#ifndef NDEBUG
1153  assert (numberIn==5);
1154#endif
1155  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1156  int nFull=5&(~(NEW_CHUNK_SIZE-1));
1157  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1158    coin_prefetch_const(element+6);
1159#if NEW_CHUNK_SIZE==2
1160    int iColumn0=thisColumn[0];
1161    int iColumn1=thisColumn[1];
1162    CoinFactorizationDouble value0=region[iColumn0];
1163    CoinFactorizationDouble value1=region[iColumn1];
1164    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1165    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1166    region[iColumn0]=value0;
1167    region[iColumn1]=value1;
1168#elif NEW_CHUNK_SIZE==4
1169    int iColumn0=thisColumn[0];
1170    int iColumn1=thisColumn[1];
1171    int iColumn2=thisColumn[2];
1172    int iColumn3=thisColumn[3];
1173    CoinFactorizationDouble value0=region[iColumn0];
1174    CoinFactorizationDouble value1=region[iColumn1];
1175    CoinFactorizationDouble value2=region[iColumn2];
1176    CoinFactorizationDouble value3=region[iColumn3];
1177    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1178    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1179    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1180    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1181    region[iColumn0]=value0;
1182    region[iColumn1]=value1;
1183    region[iColumn2]=value2;
1184    region[iColumn3]=value3;
1185#else
1186    abort();
1187#endif
1188    element+=NEW_CHUNK_SIZE_INCREMENT;
1189    thisColumn = reinterpret_cast<const int *>(element);
1190  }
1191  element++;
1192  int iColumn0=thisColumn[0];
1193  CoinFactorizationDouble value0=region[iColumn0];
1194  value0 += multiplier*element[0];
1195  region[iColumn0]=value0;
1196}
1197void CoinAbcScatterUpdate6(int numberIn, CoinFactorizationDouble multiplier,
1198                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1199                          CoinFactorizationDouble * COIN_RESTRICT region)
1200{
1201#ifndef NDEBUG
1202  assert (numberIn==6);
1203#endif
1204  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1205  int nFull=6&(~(NEW_CHUNK_SIZE-1));
1206  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1207    coin_prefetch_const(element+6);
1208#if NEW_CHUNK_SIZE==2
1209    int iColumn0=thisColumn[0];
1210    int iColumn1=thisColumn[1];
1211    CoinFactorizationDouble value0=region[iColumn0];
1212    CoinFactorizationDouble value1=region[iColumn1];
1213    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1214    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1215    region[iColumn0]=value0;
1216    region[iColumn1]=value1;
1217#elif NEW_CHUNK_SIZE==4
1218    int iColumn0=thisColumn[0];
1219    int iColumn1=thisColumn[1];
1220    int iColumn2=thisColumn[2];
1221    int iColumn3=thisColumn[3];
1222    CoinFactorizationDouble value0=region[iColumn0];
1223    CoinFactorizationDouble value1=region[iColumn1];
1224    CoinFactorizationDouble value2=region[iColumn2];
1225    CoinFactorizationDouble value3=region[iColumn3];
1226    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1227    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1228    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1229    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1230    region[iColumn0]=value0;
1231    region[iColumn1]=value1;
1232    region[iColumn2]=value2;
1233    region[iColumn3]=value3;
1234#else
1235    abort();
1236#endif
1237    element+=NEW_CHUNK_SIZE_INCREMENT;
1238    thisColumn = reinterpret_cast<const int *>(element);
1239  }
1240#if NEW_CHUNK_SIZE==4
1241  element++;
1242  int iColumn0=thisColumn[0];
1243  int iColumn1=thisColumn[1];
1244  CoinFactorizationDouble value0=region[iColumn0];
1245  CoinFactorizationDouble value1=region[iColumn1];
1246  value0 += multiplier*element[0];
1247  value1 += multiplier*element[1];
1248  region[iColumn0]=value0;
1249  region[iColumn1]=value1;
1250#endif
1251}
1252void CoinAbcScatterUpdate7(int numberIn, CoinFactorizationDouble multiplier,
1253                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1254                          CoinFactorizationDouble * COIN_RESTRICT region)
1255{
1256#ifndef NDEBUG
1257  assert (numberIn==7);
1258#endif
1259  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1260  int nFull=7&(~(NEW_CHUNK_SIZE-1));
1261  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1262    coin_prefetch_const(element+6);
1263#if NEW_CHUNK_SIZE==2
1264    int iColumn0=thisColumn[0];
1265    int iColumn1=thisColumn[1];
1266    CoinFactorizationDouble value0=region[iColumn0];
1267    CoinFactorizationDouble value1=region[iColumn1];
1268    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1269    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1270    region[iColumn0]=value0;
1271    region[iColumn1]=value1;
1272#elif NEW_CHUNK_SIZE==4
1273    int iColumn0=thisColumn[0];
1274    int iColumn1=thisColumn[1];
1275    int iColumn2=thisColumn[2];
1276    int iColumn3=thisColumn[3];
1277    CoinFactorizationDouble value0=region[iColumn0];
1278    CoinFactorizationDouble value1=region[iColumn1];
1279    CoinFactorizationDouble value2=region[iColumn2];
1280    CoinFactorizationDouble value3=region[iColumn3];
1281    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1282    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1283    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1284    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1285    region[iColumn0]=value0;
1286    region[iColumn1]=value1;
1287    region[iColumn2]=value2;
1288    region[iColumn3]=value3;
1289#else
1290    abort();
1291#endif
1292    element+=NEW_CHUNK_SIZE_INCREMENT;
1293    thisColumn = reinterpret_cast<const int *>(element);
1294  }
1295#if NEW_CHUNK_SIZE==2
1296  element++;
1297  int iColumn0=thisColumn[0];
1298  CoinFactorizationDouble value0=region[iColumn0];
1299  value0 += multiplier*element[0];
1300  region[iColumn0]=value0;
1301#else
1302  element+=2;
1303  int iColumn0=thisColumn[0];
1304  int iColumn1=thisColumn[1];
1305  int iColumn2=thisColumn[2];
1306  CoinFactorizationDouble value0=region[iColumn0];
1307  CoinFactorizationDouble value1=region[iColumn1];
1308  CoinFactorizationDouble value2=region[iColumn2];
1309  value0 += multiplier*element[0];
1310  value1 += multiplier*element[1];
1311  value2 += multiplier*element[2];
1312  region[iColumn0]=value0;
1313  region[iColumn1]=value1;
1314  region[iColumn2]=value2;
1315#endif
1316}
1317void CoinAbcScatterUpdate8(int numberIn, CoinFactorizationDouble multiplier,
1318                          const CoinFactorizationDouble *  COIN_RESTRICT element,
1319                          CoinFactorizationDouble * COIN_RESTRICT region)
1320{
1321#ifndef NDEBUG
1322  assert (numberIn==8);
1323#endif
1324  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1325  int nFull=8&(~(NEW_CHUNK_SIZE-1));
1326  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1327    coin_prefetch_const(element+6);
1328#if NEW_CHUNK_SIZE==2
1329    int iColumn0=thisColumn[0];
1330    int iColumn1=thisColumn[1];
1331    CoinFactorizationDouble value0=region[iColumn0];
1332    CoinFactorizationDouble value1=region[iColumn1];
1333    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1334    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1335    region[iColumn0]=value0;
1336    region[iColumn1]=value1;
1337#elif NEW_CHUNK_SIZE==4
1338    int iColumn0=thisColumn[0];
1339    int iColumn1=thisColumn[1];
1340    int iColumn2=thisColumn[2];
1341    int iColumn3=thisColumn[3];
1342    CoinFactorizationDouble value0=region[iColumn0];
1343    CoinFactorizationDouble value1=region[iColumn1];
1344    CoinFactorizationDouble value2=region[iColumn2];
1345    CoinFactorizationDouble value3=region[iColumn3];
1346    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1347    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1348    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1349    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1350    region[iColumn0]=value0;
1351    region[iColumn1]=value1;
1352    region[iColumn2]=value2;
1353    region[iColumn3]=value3;
1354#else
1355    abort();
1356#endif
1357    element+=NEW_CHUNK_SIZE_INCREMENT;
1358    thisColumn = reinterpret_cast<const int *>(element);
1359  }
1360}
1361void CoinAbcScatterUpdate4N(int numberIn, CoinFactorizationDouble multiplier,
1362                                 const CoinFactorizationDouble *  COIN_RESTRICT element,
1363                                 CoinFactorizationDouble * COIN_RESTRICT region)
1364{
1365  assert ((numberIn&3)==0);
1366  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1367  int nFull=numberIn&(~(NEW_CHUNK_SIZE-1));
1368  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1369    coin_prefetch_const(element+6);
1370#if NEW_CHUNK_SIZE==2
1371    int iColumn0=thisColumn[0];
1372    int iColumn1=thisColumn[1];
1373    CoinFactorizationDouble value0=region[iColumn0];
1374    CoinFactorizationDouble value1=region[iColumn1];
1375    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1376    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1377    region[iColumn0]=value0;
1378    region[iColumn1]=value1;
1379#elif NEW_CHUNK_SIZE==4
1380    int iColumn0=thisColumn[0];
1381    int iColumn1=thisColumn[1];
1382    int iColumn2=thisColumn[2];
1383    int iColumn3=thisColumn[3];
1384    CoinFactorizationDouble value0=region[iColumn0];
1385    CoinFactorizationDouble value1=region[iColumn1];
1386    CoinFactorizationDouble value2=region[iColumn2];
1387    CoinFactorizationDouble value3=region[iColumn3];
1388    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1389    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1390    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1391    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1392    region[iColumn0]=value0;
1393    region[iColumn1]=value1;
1394    region[iColumn2]=value2;
1395    region[iColumn3]=value3;
1396#else
1397    abort();
1398#endif
1399    element+=NEW_CHUNK_SIZE_INCREMENT;
1400    thisColumn = reinterpret_cast<const int *>(element);
1401  }
1402}
1403void CoinAbcScatterUpdate4NPlus1(int numberIn, CoinFactorizationDouble multiplier,
1404                                 const CoinFactorizationDouble *  COIN_RESTRICT element,
1405                                 CoinFactorizationDouble * COIN_RESTRICT region)
1406{
1407  assert ((numberIn&3)==1);
1408  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1409  int nFull=numberIn&(~(NEW_CHUNK_SIZE-1));
1410  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1411    coin_prefetch_const(element+6);
1412#if NEW_CHUNK_SIZE==2
1413    int iColumn0=thisColumn[0];
1414    int iColumn1=thisColumn[1];
1415    CoinFactorizationDouble value0=region[iColumn0];
1416    CoinFactorizationDouble value1=region[iColumn1];
1417    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1418    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1419    region[iColumn0]=value0;
1420    region[iColumn1]=value1;
1421#elif NEW_CHUNK_SIZE==4
1422    int iColumn0=thisColumn[0];
1423    int iColumn1=thisColumn[1];
1424    int iColumn2=thisColumn[2];
1425    int iColumn3=thisColumn[3];
1426    CoinFactorizationDouble value0=region[iColumn0];
1427    CoinFactorizationDouble value1=region[iColumn1];
1428    CoinFactorizationDouble value2=region[iColumn2];
1429    CoinFactorizationDouble value3=region[iColumn3];
1430    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1431    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1432    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1433    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1434    region[iColumn0]=value0;
1435    region[iColumn1]=value1;
1436    region[iColumn2]=value2;
1437    region[iColumn3]=value3;
1438#else
1439    abort();
1440#endif
1441    element+=NEW_CHUNK_SIZE_INCREMENT;
1442    thisColumn = reinterpret_cast<const int *>(element);
1443  }
1444  element++;
1445  int iColumn0=thisColumn[0];
1446  CoinFactorizationDouble value0=region[iColumn0];
1447  value0 += multiplier*element[0];
1448  region[iColumn0]=value0;
1449}
1450void CoinAbcScatterUpdate4NPlus2(int numberIn, CoinFactorizationDouble multiplier,
1451                                 const CoinFactorizationDouble *  COIN_RESTRICT element,
1452                                 CoinFactorizationDouble * COIN_RESTRICT region)
1453{
1454  assert ((numberIn&3)==2);
1455  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1456  int nFull=numberIn&(~(NEW_CHUNK_SIZE-1));
1457  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1458    coin_prefetch_const(element+6);
1459#if NEW_CHUNK_SIZE==2
1460    int iColumn0=thisColumn[0];
1461    int iColumn1=thisColumn[1];
1462    CoinFactorizationDouble value0=region[iColumn0];
1463    CoinFactorizationDouble value1=region[iColumn1];
1464    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1465    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1466    region[iColumn0]=value0;
1467    region[iColumn1]=value1;
1468#elif NEW_CHUNK_SIZE==4
1469    int iColumn0=thisColumn[0];
1470    int iColumn1=thisColumn[1];
1471    int iColumn2=thisColumn[2];
1472    int iColumn3=thisColumn[3];
1473    CoinFactorizationDouble value0=region[iColumn0];
1474    CoinFactorizationDouble value1=region[iColumn1];
1475    CoinFactorizationDouble value2=region[iColumn2];
1476    CoinFactorizationDouble value3=region[iColumn3];
1477    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1478    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1479    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1480    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1481    region[iColumn0]=value0;
1482    region[iColumn1]=value1;
1483    region[iColumn2]=value2;
1484    region[iColumn3]=value3;
1485#else
1486    abort();
1487#endif
1488    element+=NEW_CHUNK_SIZE_INCREMENT;
1489    thisColumn = reinterpret_cast<const int *>(element);
1490  }
1491#if NEW_CHUNK_SIZE==4
1492  element++;
1493  int iColumn0=thisColumn[0];
1494  int iColumn1=thisColumn[1];
1495  CoinFactorizationDouble value0=region[iColumn0];
1496  CoinFactorizationDouble value1=region[iColumn1];
1497  value0 += multiplier*element[0];
1498  value1 += multiplier*element[1];
1499  region[iColumn0]=value0;
1500  region[iColumn1]=value1;
1501#endif
1502}
1503void CoinAbcScatterUpdate4NPlus3(int numberIn, CoinFactorizationDouble multiplier,
1504                                 const CoinFactorizationDouble *  COIN_RESTRICT element,
1505                                 CoinFactorizationDouble * COIN_RESTRICT region)
1506{
1507  assert ((numberIn&3)==3);
1508  const int * COIN_RESTRICT thisColumn = reinterpret_cast<const int *>(element);
1509  int nFull=numberIn&(~(NEW_CHUNK_SIZE-1));
1510  for (int j=0;j<nFull;j+=NEW_CHUNK_SIZE) {
1511    coin_prefetch_const(element+6);
1512#if NEW_CHUNK_SIZE==2
1513    int iColumn0=thisColumn[0];
1514    int iColumn1=thisColumn[1];
1515    CoinFactorizationDouble value0=region[iColumn0];
1516    CoinFactorizationDouble value1=region[iColumn1];
1517    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1518    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1519    region[iColumn0]=value0;
1520    region[iColumn1]=value1;
1521#elif NEW_CHUNK_SIZE==4
1522    int iColumn0=thisColumn[0];
1523    int iColumn1=thisColumn[1];
1524    int iColumn2=thisColumn[2];
1525    int iColumn3=thisColumn[3];
1526    CoinFactorizationDouble value0=region[iColumn0];
1527    CoinFactorizationDouble value1=region[iColumn1];
1528    CoinFactorizationDouble value2=region[iColumn2];
1529    CoinFactorizationDouble value3=region[iColumn3];
1530    value0 += multiplier*element[0+NEW_CHUNK_SIZE_OFFSET];
1531    value1 += multiplier*element[1+NEW_CHUNK_SIZE_OFFSET];
1532    value2 += multiplier*element[2+NEW_CHUNK_SIZE_OFFSET];
1533    value3 += multiplier*element[3+NEW_CHUNK_SIZE_OFFSET];
1534    region[iColumn0]=value0;
1535    region[iColumn1]=value1;
1536    region[iColumn2]=value2;
1537    region[iColumn3]=value3;
1538#else
1539    abort();
1540#endif
1541    element+=NEW_CHUNK_SIZE_INCREMENT;
1542    thisColumn = reinterpret_cast<const int *>(element);
1543  }
1544#if NEW_CHUNK_SIZE==2
1545  element++;
1546  int iColumn0=thisColumn[0];
1547  CoinFactorizationDouble value0=region[iColumn0];
1548  value0 += multiplier*element[0];
1549  region[iColumn0]=value0;
1550#else
1551  element+=2;
1552  int iColumn0=thisColumn[0];
1553  int iColumn1=thisColumn[1];
1554  int iColumn2=thisColumn[2];
1555  CoinFactorizationDouble value0=region[iColumn0];
1556  CoinFactorizationDouble value1=region[iColumn1];
1557  CoinFactorizationDouble value2=region[iColumn2];
1558  value0 += multiplier*element[0];
1559  value1 += multiplier*element[1];
1560  value2 += multiplier*element[2];
1561  region[iColumn0]=value0;
1562  region[iColumn1]=value1;
1563  region[iColumn2]=value2;
1564#endif
1565}
1566#else
1567// alternative
1568#define CACHE_LINE_SIZE 16
1569#define OPERATION +=
1570#define functionName(zname) CoinAbc##zname
1571#define ABC_CREATE_SCATTER_FUNCTION 1
1572#include "CoinAbcHelperFunctions.hpp"
1573#undef OPERATION
1574#undef functionName
1575#define OPERATION -=
1576#define functionName(zname) CoinAbc##zname##Subtract
1577#include "CoinAbcHelperFunctions.hpp"
1578#undef OPERATION
1579#undef functionName
1580#define OPERATION +=
1581#define functionName(zname) CoinAbc##zname##Add
1582#include "CoinAbcHelperFunctions.hpp"
1583scatterUpdate AbcScatterLow[]={
1584  &CoinAbcScatterUpdate0,
1585  &CoinAbcScatterUpdate1,
1586  &CoinAbcScatterUpdate2,
1587  &CoinAbcScatterUpdate3,
1588  &CoinAbcScatterUpdate4,
1589  &CoinAbcScatterUpdate5,
1590  &CoinAbcScatterUpdate6,
1591  &CoinAbcScatterUpdate7,
1592  &CoinAbcScatterUpdate8};
1593scatterUpdate AbcScatterHigh[]={
1594  &CoinAbcScatterUpdate4N,
1595  &CoinAbcScatterUpdate4NPlus1,
1596  &CoinAbcScatterUpdate4NPlus2,
1597  &CoinAbcScatterUpdate4NPlus3};
1598scatterUpdate AbcScatterLowSubtract[]={
1599  &CoinAbcScatterUpdate0,
1600  &CoinAbcScatterUpdate1Subtract,
1601  &CoinAbcScatterUpdate2Subtract,
1602  &CoinAbcScatterUpdate3Subtract,
1603  &CoinAbcScatterUpdate4Subtract,
1604  &CoinAbcScatterUpdate5Subtract,
1605  &CoinAbcScatterUpdate6Subtract,
1606  &CoinAbcScatterUpdate7Subtract,
1607  &CoinAbcScatterUpdate8Subtract};
1608scatterUpdate AbcScatterHighSubtract[]={
1609  &CoinAbcScatterUpdate4NSubtract,
1610  &CoinAbcScatterUpdate4NPlus1Subtract,
1611  &CoinAbcScatterUpdate4NPlus2Subtract,
1612  &CoinAbcScatterUpdate4NPlus3Subtract};
1613scatterUpdate AbcScatterLowAdd[]={
1614  &CoinAbcScatterUpdate0,
1615  &CoinAbcScatterUpdate1Add,
1616  &CoinAbcScatterUpdate2Add,
1617  &CoinAbcScatterUpdate3Add,
1618  &CoinAbcScatterUpdate4Add,
1619  &CoinAbcScatterUpdate5Add,
1620  &CoinAbcScatterUpdate6Add,
1621  &CoinAbcScatterUpdate7Add,
1622  &CoinAbcScatterUpdate8Add};
1623scatterUpdate AbcScatterHighAdd[]={
1624  &CoinAbcScatterUpdate4NAdd,
1625  &CoinAbcScatterUpdate4NPlus1Add,
1626  &CoinAbcScatterUpdate4NPlus2Add,
1627  &CoinAbcScatterUpdate4NPlus3Add};
1628#endif
1629#include "CoinPragma.hpp"
1630
1631#include <math.h>
1632
1633#include <cfloat>
1634#include <cassert>
1635#include <string>
1636#include <stdio.h>
1637#include <iostream>
1638#include "CoinAbcCommonFactorization.hpp"
1639#if 1
1640#if AVX2==1
1641#include "emmintrin.h"
1642#elif AVX2==2
1643#include <immintrin.h>
1644#elif AVX2==3
1645#include "avx2intrin.h"
1646#endif
1647#endif
1648// cilk seems a bit fragile
1649#define CILK_FRAGILE 0
1650#if CILK_FRAGILE>1
1651#undef cilk_spawn
1652#undef cilk_sync
1653#define cilk_spawn
1654#define cilk_sync
1655#define ONWARD 0
1656#elif CILK_FRAGILE==1
1657#define ONWARD 0
1658#else
1659#define ONWARD 1
1660#endif
1661#define BLOCKING1 8 // factorization strip
1662#define BLOCKING2 8 // dgemm recursive
1663#define BLOCKING3 32 // dgemm parallel
1664void CoinAbcDgemm(int m, int n, int k, double * COIN_RESTRICT a,int lda,
1665                          double * COIN_RESTRICT b,double * COIN_RESTRICT c
1666#if ABC_PARALLEL==2
1667                          ,int parallelMode
1668#endif
1669)
1670{
1671  assert ((m&(BLOCKING8-1))==0&&(n&(BLOCKING8-1))==0&&(k&(BLOCKING8-1))==0);
1672  /* entry for column j and row i (when multiple of BLOCKING8)
1673     is at aBlocked+j*m+i*BLOCKING8
1674  */
1675  // k is always smallish
1676  if (m<=BLOCKING8&&n<=BLOCKING8) {
1677    assert (m==BLOCKING8&&n==BLOCKING8&&k==BLOCKING8);
1678    double * COIN_RESTRICT aBase2=a;
1679    double * COIN_RESTRICT bBase2=b;
1680    double * COIN_RESTRICT cBase2=c;
1681    for (int j=0;j<BLOCKING8;j++) {
1682      double * COIN_RESTRICT aBase=aBase2;
1683#if AVX2!=2
1684#if 1
1685      double c0=cBase2[0];
1686      double c1=cBase2[1];
1687      double c2=cBase2[2];
1688      double c3=cBase2[3];
1689      double c4=cBase2[4];
1690      double c5=cBase2[5];
1691      double c6=cBase2[6];
1692      double c7=cBase2[7];
1693      for (int l=0;l<BLOCKING8;l++) {
1694        double bValue = bBase2[l];
1695        if (bValue) {
1696          c0 -= bValue*aBase[0];
1697          c1 -= bValue*aBase[1];
1698          c2 -= bValue*aBase[2];
1699          c3 -= bValue*aBase[3];
1700          c4 -= bValue*aBase[4];
1701          c5 -= bValue*aBase[5];
1702          c6 -= bValue*aBase[6];
1703          c7 -= bValue*aBase[7];
1704        }
1705        aBase += BLOCKING8;
1706      }
1707      cBase2[0]=c0;
1708      cBase2[1]=c1;
1709      cBase2[2]=c2;
1710      cBase2[3]=c3;
1711      cBase2[4]=c4;
1712      cBase2[5]=c5;
1713      cBase2[6]=c6;
1714      cBase2[7]=c7;
1715#else
1716      for (int l=0;l<BLOCKING8;l++) {
1717        double bValue = bBase2[l];
1718        if (bValue) {
1719          for (int i=0;i<BLOCKING8;i++) {
1720            cBase2[i] -= bValue*aBase[i];
1721          }
1722        }
1723        aBase += BLOCKING8;
1724      }
1725#endif
1726#else
1727      //__m256d c0=_mm256_load_pd(cBase2);
1728      __m256d c0=*reinterpret_cast<__m256d *>(cBase2);
1729      //__m256d c1=_mm256_load_pd(cBase2+4);
1730      __m256d c1=*reinterpret_cast<__m256d *>(cBase2+4);
1731      for (int l=0;l<BLOCKING8;l++) {
1732        //__m256d bb = _mm256_broadcast_sd(bBase2+l);
1733        __m256d bb = static_cast<__m256d> (__builtin_ia32_vbroadcastsd256 (bBase2+l));
1734        //__m256d a0 = _mm256_load_pd(aBase);
1735        __m256d a0=*reinterpret_cast<__m256d *>(aBase);
1736        //__m256d a1 = _mm256_load_pd(aBase+4);
1737        __m256d a1=*reinterpret_cast<__m256d *>(aBase+4);
1738        c0 -= bb*a0;
1739        c1 -= bb*a1;
1740        aBase += BLOCKING8;
1741      }
1742      //_mm256_store_pd (cBase2, c0);
1743      *reinterpret_cast<__m256d *>(cBase2)=c0;
1744      //_mm256_store_pd (cBase2+4, c1);
1745      *reinterpret_cast<__m256d *>(cBase2+4)=c1;
1746#endif
1747      bBase2 += BLOCKING8;
1748      cBase2 += BLOCKING8;
1749    }
1750  } else if (m>n) {
1751    // make sure mNew1 multiple of BLOCKING8
1752#if BLOCKING8==8
1753    int mNew1=((m+15)>>4)<<3;
1754#elif BLOCKING8==4
1755    int mNew1=((m+7)>>3)<<2;
1756#elif BLOCKING8==2
1757    int mNew1=((m+3)>>2)<<1;
1758#else
1759    abort();
1760#endif
1761    assert (mNew1>0&&m-mNew1>0);
1762#if ABC_PARALLEL==2
1763    if (mNew1<=BLOCKING3||!parallelMode) {
1764#endif
1765      //printf("splitMa mNew1 %d\n",mNew1);
1766      CoinAbcDgemm(mNew1,n,k,a,lda,b,c
1767#if ABC_PARALLEL==2
1768                          ,0
1769#endif
1770);
1771      //printf("splitMb mNew1 %d\n",mNew1);
1772      CoinAbcDgemm(m-mNew1,n,k,a+mNew1*BLOCKING8,lda,b,c+mNew1*BLOCKING8
1773#if ABC_PARALLEL==2
1774                          ,0
1775#endif
1776);
1777#if ABC_PARALLEL==2
1778    } else {
1779      //printf("splitMa mNew1 %d\n",mNew1);
1780      cilk_spawn CoinAbcDgemm(mNew1,n,k,a,lda,b,c,ONWARD);
1781      //printf("splitMb mNew1 %d\n",mNew1);
1782      CoinAbcDgemm(m-mNew1,n,k,a+mNew1*BLOCKING8,lda,b,c+mNew1*BLOCKING8,ONWARD);
1783      cilk_sync;
1784    }
1785#endif
1786  } else {
1787    // make sure nNew1 multiple of BLOCKING8
1788#if BLOCKING8==8
1789    int nNew1=((n+15)>>4)<<3;
1790#elif BLOCKING8==4
1791    int nNew1=((n+7)>>3)<<2;
1792#elif BLOCKING8==2
1793    int nNew1=((n+3)>>2)<<1;
1794#else
1795    abort();
1796#endif
1797    assert (nNew1>0&&n-nNew1>0);
1798#if ABC_PARALLEL==2
1799    if (nNew1<=BLOCKING3||!parallelMode) {
1800#endif
1801      //printf("splitNa nNew1 %d\n",nNew1);
1802      CoinAbcDgemm(m,nNew1,k,a,lda,b,c
1803#if ABC_PARALLEL==2
1804                          ,0
1805#endif
1806                    );
1807      //printf("splitNb nNew1 %d\n",nNew1);
1808      CoinAbcDgemm(m,n-nNew1,k,a,lda,b+lda*nNew1,c+lda*nNew1
1809#if ABC_PARALLEL==2
1810                          ,0
1811#endif
1812                    );
1813#if ABC_PARALLEL==2
1814    } else {
1815      //printf("splitNa nNew1 %d\n",nNew1);
1816      cilk_spawn CoinAbcDgemm(m,nNew1,k,a,lda,b,c,ONWARD);
1817      //printf("splitNb nNew1 %d\n",nNew1);
1818      CoinAbcDgemm(m,n-nNew1,k,a,lda,b+lda*nNew1,c+lda*nNew1,ONWARD);
1819      cilk_sync;
1820    }
1821#endif
1822  }
1823}
1824#ifdef ABC_LONG_FACTORIZATION
1825// Start long double version
1826void CoinAbcDgemm(int m, int n, int k, long double * COIN_RESTRICT a,int lda,
1827                          long double * COIN_RESTRICT b,long double * COIN_RESTRICT c
1828#if ABC_PARALLEL==2
1829                          ,int parallelMode
1830#endif
1831)
1832{
1833  assert ((m&(BLOCKING8-1))==0&&(n&(BLOCKING8-1))==0&&(k&(BLOCKING8-1))==0);
1834  /* entry for column j and row i (when multiple of BLOCKING8)
1835     is at aBlocked+j*m+i*BLOCKING8
1836  */
1837  // k is always smallish
1838  if (m<=BLOCKING8&&n<=BLOCKING8) {
1839    assert (m==BLOCKING8&&n==BLOCKING8&&k==BLOCKING8);
1840    long double * COIN_RESTRICT aBase2=a;
1841    long double * COIN_RESTRICT bBase2=b;
1842    long double * COIN_RESTRICT cBase2=c;
1843    for (int j=0;j<BLOCKING8;j++) {
1844      long double * COIN_RESTRICT aBase=aBase2;
1845#if AVX2!=2
1846#if 1
1847      long double c0=cBase2[0];
1848      long double c1=cBase2[1];
1849      long double c2=cBase2[2];
1850      long double c3=cBase2[3];
1851      long double c4=cBase2[4];
1852      long double c5=cBase2[5];
1853      long double c6=cBase2[6];
1854      long double c7=cBase2[7];
1855      for (int l=0;l<BLOCKING8;l++) {
1856        long double bValue = bBase2[l];
1857        if (bValue) {
1858          c0 -= bValue*aBase[0];
1859          c1 -= bValue*aBase[1];
1860          c2 -= bValue*aBase[2];
1861          c3 -= bValue*aBase[3];
1862          c4 -= bValue*aBase[4];
1863          c5 -= bValue*aBase[5];
1864          c6 -= bValue*aBase[6];
1865          c7 -= bValue*aBase[7];
1866        }
1867        aBase += BLOCKING8;
1868      }
1869      cBase2[0]=c0;
1870      cBase2[1]=c1;
1871      cBase2[2]=c2;
1872      cBase2[3]=c3;
1873      cBase2[4]=c4;
1874      cBase2[5]=c5;
1875      cBase2[6]=c6;
1876      cBase2[7]=c7;
1877#else
1878      for (int l=0;l<BLOCKING8;l++) {
1879        long double bValue = bBase2[l];
1880        if (bValue) {
1881          for (int i=0;i<BLOCKING8;i++) {
1882            cBase2[i] -= bValue*aBase[i];
1883          }
1884        }
1885        aBase += BLOCKING8;
1886      }
1887#endif
1888#else
1889      //__m256d c0=_mm256_load_pd(cBase2);
1890      __m256d c0=*reinterpret_cast<__m256d *>(cBase2);
1891      //__m256d c1=_mm256_load_pd(cBase2+4);
1892      __m256d c1=*reinterpret_cast<__m256d *>(cBase2+4);
1893      for (int l=0;l<BLOCKING8;l++) {
1894        //__m256d bb = _mm256_broadcast_sd(bBase2+l);
1895        __m256d bb = static_cast<__m256d> (__builtin_ia32_vbroadcastsd256 (bBase2+l));
1896        //__m256d a0 = _mm256_load_pd(aBase);
1897        __m256d a0=*reinterpret_cast<__m256d *>(aBase);
1898        //__m256d a1 = _mm256_load_pd(aBase+4);
1899        __m256d a1=*reinterpret_cast<__m256d *>(aBase+4);
1900        c0 -= bb*a0;
1901        c1 -= bb*a1;
1902        aBase += BLOCKING8;
1903      }
1904      //_mm256_store_pd (cBase2, c0);
1905      *reinterpret_cast<__m256d *>(cBase2)=c0;
1906      //_mm256_store_pd (cBase2+4, c1);
1907      *reinterpret_cast<__m256d *>(cBase2+4)=c1;
1908#endif
1909      bBase2 += BLOCKING8;
1910      cBase2 += BLOCKING8;
1911    }
1912  } else if (m>n) {
1913    // make sure mNew1 multiple of BLOCKING8
1914#if BLOCKING8==8
1915    int mNew1=((m+15)>>4)<<3;
1916#elif BLOCKING8==4
1917    int mNew1=((m+7)>>3)<<2;
1918#elif BLOCKING8==2
1919    int mNew1=((m+3)>>2)<<1;
1920#else
1921    abort();
1922#endif
1923    assert (mNew1>0&&m-mNew1>0);
1924#if ABC_PARALLEL==2
1925    if (mNew1<=BLOCKING3||!parallelMode) {
1926#endif
1927      //printf("splitMa mNew1 %d\n",mNew1);
1928      CoinAbcDgemm(mNew1,n,k,a,lda,b,c
1929#if ABC_PARALLEL==2
1930                          ,0
1931#endif
1932);
1933      //printf("splitMb mNew1 %d\n",mNew1);
1934      CoinAbcDgemm(m-mNew1,n,k,a+mNew1*BLOCKING8,lda,b,c+mNew1*BLOCKING8
1935#if ABC_PARALLEL==2
1936                          ,0
1937#endif
1938);
1939#if ABC_PARALLEL==2
1940    } else {
1941      //printf("splitMa mNew1 %d\n",mNew1);
1942      cilk_spawn CoinAbcDgemm(mNew1,n,k,a,lda,b,c,ONWARD);
1943      //printf("splitMb mNew1 %d\n",mNew1);
1944      CoinAbcDgemm(m-mNew1,n,k,a+mNew1*BLOCKING8,lda,b,c+mNew1*BLOCKING8,ONWARD);
1945      cilk_sync;
1946    }
1947#endif
1948  } else {
1949    // make sure nNew1 multiple of BLOCKING8
1950#if BLOCKING8==8
1951    int nNew1=((n+15)>>4)<<3;
1952#elif BLOCKING8==4
1953    int nNew1=((n+7)>>3)<<2;
1954#elif BLOCKING8==2
1955    int nNew1=((n+3)>>2)<<1;
1956#else
1957    abort();
1958#endif
1959    assert (nNew1>0&&n-nNew1>0);
1960#if ABC_PARALLEL==2
1961    if (nNew1<=BLOCKING3||!parallelMode) {
1962#endif
1963      //printf("splitNa nNew1 %d\n",nNew1);
1964      CoinAbcDgemm(m,nNew1,k,a,lda,b,c
1965#if ABC_PARALLEL==2
1966                          ,0
1967#endif
1968                    );
1969      //printf("splitNb nNew1 %d\n",nNew1);
1970      CoinAbcDgemm(m,n-nNew1,k,a,lda,b+lda*nNew1,c+lda*nNew1
1971#if ABC_PARALLEL==2
1972                          ,0
1973#endif
1974                    );
1975#if ABC_PARALLEL==2
1976    } else {
1977      //printf("splitNa nNew1 %d\n",nNew1);
1978      cilk_spawn CoinAbcDgemm(m,nNew1,k,a,lda,b,c,ONWARD);
1979      //printf("splitNb nNew1 %d\n",nNew1);
1980      CoinAbcDgemm(m,n-nNew1,k,a,lda,b+lda*nNew1,c+lda*nNew1,ONWARD);
1981      cilk_sync;
1982    }
1983#endif
1984  }
1985}
1986#endif
1987// From Intel site
1988// get AVX intrinsics 
1989#include <immintrin.h> 
1990// get CPUID capability 
1991//#include <intrin.h> 
1992#define cpuid(func,ax,bx,cx,dx)\
1993        __asm__ __volatile__ ("cpuid":\
1994        "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func)); 
1995// written for clarity, not conciseness 
1996#define OSXSAVEFlag (1UL<<27) 
1997#define AVXFlag     ((1UL<<28)|OSXSAVEFlag) 
1998#define VAESFlag    ((1UL<<25)|AVXFlag|OSXSAVEFlag) 
1999#define FMAFlag     ((1UL<<12)|AVXFlag|OSXSAVEFlag) 
2000#define CLMULFlag   ((1UL<< 1)|AVXFlag|OSXSAVEFlag) 
2001   
2002bool DetectFeature(unsigned int feature) 
2003{ 
2004  int CPUInfo[4]; //, InfoType=1, ECX = 1; 
2005  //__cpuidex(CPUInfo, 1, 1);       // read the desired CPUID format 
2006  cpuid(1,CPUInfo[0],CPUInfo[1],CPUInfo[2],CPUInfo[3]);
2007  unsigned int ECX = CPUInfo[2];  // the output of CPUID in the ECX register.   
2008  if ((ECX & feature) != feature) // Missing feature   
2009    return false;   
2010#if 0
2011  long int val = _xgetbv(0);       // read XFEATURE_ENABLED_MASK register 
2012  if ((val&6) != 6)               // check OS has enabled both XMM and YMM support. 
2013    return false;   
2014#endif
2015  return true; 
2016} 
2017#endif
Note: See TracBrowser for help on using the repository browser.