xref: /aosp_15_r20/external/coreboot/src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c (revision b9411a12aaaa7e1e6a6fb7c5e057f44ee179a49c)
1 /***********************license start***********************************
2 * Copyright (c) 2003-2017  Cavium Inc. ([email protected]). All rights
3 * reserved.
4 *
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are
8 * met:
9 *
10 *   * Redistributions of source code must retain the above copyright
11 *     notice, this list of conditions and the following disclaimer.
12 *
13 *   * Redistributions in binary form must reproduce the above
14 *     copyright notice, this list of conditions and the following
15 *     disclaimer in the documentation and/or other materials provided
16 *     with the distribution.
17 *
18 *   * Neither the name of Cavium Inc. nor the names of
19 *     its contributors may be used to endorse or promote products
20 *     derived from this software without specific prior written
21 *     permission.
22 *
23 * This Software, including technical data, may be subject to U.S. export
24 * control laws, including the U.S. Export Administration Act and its
25 * associated regulations, and may be subject to export or import
26 * regulations in other countries.
27 *
28 * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
29 * AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR
30 * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT
31 * TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY
32 * REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT
33 * DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES
34 * OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR
35 * PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT,
36 * QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK
37 * ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
38 ***********************license end**************************************/
39 #include <bdk.h>
40 #include "dram-internal.h"
41 
42 #include <string.h>
43 #include <lame_string.h>                /* for strtoul */
44 #include <libbdk-hal/bdk-atomic.h>
45 #include <libbdk-hal/bdk-clock.h>
46 #include <libbdk-hal/bdk-rng.h>
47 #include <libbdk-os/bdk-init.h>
48 
// If enhanced verbosity levels are defined (VB_PRT from the full BDK build),
// map the numbered ddr_print levels onto them; otherwise fall back to the
// plain ddr_print for every level.
#if defined(VB_PRT)
#define ddr_print2(format, ...) VB_PRT(VBL_FAE,  format, ##__VA_ARGS__)
#define ddr_print3(format, ...) VB_PRT(VBL_TME,  format, ##__VA_ARGS__)
#define ddr_print4(format, ...) VB_PRT(VBL_DEV,  format, ##__VA_ARGS__)
#define ddr_print5(format, ...) VB_PRT(VBL_DEV3, format, ##__VA_ARGS__)
#else
#define ddr_print2 ddr_print
#define ddr_print3 ddr_print  // BUGFIX: was missing, leaving ddr_print3 undefined in this branch
#define ddr_print4 ddr_print
#define ddr_print5 ddr_print
#endif
60 
// Status shared between the DRAM test workers and the coordinator below.
// These are only ever read/written through the bdk_atomic_* accessors
// (see run_dram_tuning_threads).
static  int64_t test_dram_byte_threads_done;   // number of worker threads that have completed
static uint64_t test_dram_byte_threads_errs;   // summary error bits across all LMCs
static uint64_t test_dram_byte_lmc_errs[4];    // per-LMC error bits (up to 4 LMCs)
64 
// NOTE(review): this whole region is compiled out (#if 0). The tables below
// are candidate fixed test patterns for the (also disabled) memory-fill loop
// in dram_tuning_mem_xor; the active code uses a random multiplicative
// pattern instead. Kept for reference.
#if 0
/*
 * Suggested testing patterns.
 */
static const uint64_t test_pattern_2[] = {
    0xFFFFFFFFFFFFFFFFULL,
    0xAAAAAAAAAAAAAAAAULL,
    0xFFFFFFFFFFFFFFFFULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0xFFFFFFFFFFFFFFFFULL,
    0xAAAAAAAAAAAAAAAAULL,
    0xFFFFFFFFFFFFFFFFULL,
    0x5555555555555555ULL,
    0xFFFFFFFFFFFFFFFFULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xFFFFFFFFFFFFFFFFULL,
    0x5555555555555555ULL,
};
 /*
 *  or possibly
 */
static const uint64_t test_pattern_3[] = {
    0xFDFDFDFDFDFDFDFDULL,
    0x8787878787878787ULL,
    0xFEFEFEFEFEFEFEFEULL,
    0xC3C3C3C3C3C3C3C3ULL,
    0x7F7F7F7F7F7F7F7FULL,
    0xE1E1E1E1E1E1E1E1ULL,
    0xBFBFBFBFBFBFBFBFULL,
    0xF0F0F0F0F0F0F0F0ULL,
    0xDFDFDFDFDFDFDFDFULL,
    0x7878787878787878ULL,
    0xEFEFEFEFEFEFEFEFULL,
    0x3C3C3C3C3C3C3C3CULL,
    0xF7F7F7F7F7F7F7F7ULL,
    0x1E1E1E1E1E1E1E1EULL,
    0xFBFBFBFBFBFBFBFBULL,
    0x0F0F0F0F0F0F0F0FULL,
};

static const uint64_t test_pattern_1[] = {
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
#if 0 // only need a cacheline size
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
    0xAAAAAAAAAAAAAAAAULL,
    0x5555555555555555ULL,
#endif
};

// setup default for test pattern array
static const uint64_t *dram_tune_test_pattern = test_pattern_1;
#endif
149 
// set this to 1 to shorten the testing to exit when all byte lanes have errors
// having this at 0 forces the testing to take place over the entire range every iteration,
// hopefully ensuring an even load on the memory subsystem
#define EXIT_WHEN_ALL_LANES_HAVE_ERRORS 0

#define DEFAULT_TEST_BURSTS 5 // FIXME: this is what works so far...// FIXME: was 7
// Number of XOR/verify passes per test run; overridable by callers.
int dram_tune_use_bursts = DEFAULT_TEST_BURSTS;

// dram_tune_rank_offset is used to offset the second area used in test_dram_mem_xor.
//
// If only a single-rank DIMM, the offset will be 256MB from the start of the first area,
//  which is more than enough for the restricted looping/address range actually tested...
//
// If a 2-rank DIMM, the offset will be the size of a rank's address space, so the effect
//  will be to have the first and second areas in different ranks on the same DIMM.
//
// So, we default this to single-rank, and it will be overridden when 2-ranks are detected.
//

// FIXME: ASSUME that we have DIMMS no less than 4GB in size

// offset to first area that avoids any boot stuff in low range (below 256MB)
#define AREA_BASE_OFFSET (1ULL << 28) // bit 28 always ON

// offset to duplicate area; may coincide with rank 1 base address for 2-rank 4GB DIMM
#define AREA_DUPE_OFFSET (1ULL << 31) // bit 31 always ON

// defaults to DUPE, but will be set elsewhere to offset to next RANK if multi-rank DIMM
static uint64_t dram_tune_rank_offset = AREA_DUPE_OFFSET; // default

// defaults to 0, but will be set elsewhere to the address offset to next DIMM if multi-slot
static uint64_t dram_tune_dimm_offset = 0; // default


// Per-speed-bin risk thresholds, indexed by get_speed_bin() (0/1/2):
// maximum "low risk" |DLL offset| and minimum "low risk" window length.
static int speed_bin_offset[3] = {25, 20, 15};
static int speed_bin_winlen[3] = {70, 60, 60};
186 
187 static int
get_speed_bin(bdk_node_t node,int lmc)188 get_speed_bin(bdk_node_t node, int lmc)
189 {
190     uint32_t mts_speed = (libdram_get_freq_from_pll(node, lmc) / 1000000) * 2;
191     int ret = 0;
192 
193     // FIXME: is this reasonable speed "binning"?
194     if (mts_speed >= 1700) {
195         if (mts_speed >= 2000)
196             ret = 2;
197         else
198             ret = 1;
199     }
200 
201     debug_print("N%d.LMC%d: %s: returning bin %d for MTS %d\n",
202                 node, lmc, __func__, ret, mts_speed);
203 
204     return ret;
205 }
206 
is_low_risk_offset(int speed_bin,int offset)207 static int is_low_risk_offset(int speed_bin, int offset)
208 {
209     return (_abs(offset) <= speed_bin_offset[speed_bin]);
210 }
is_low_risk_winlen(int speed_bin,int winlen)211 static int is_low_risk_winlen(int speed_bin, int winlen)
212 {
213     return (winlen >= speed_bin_winlen[speed_bin]);
214 }
215 
// Compile-time knobs for the memory test loops in dram_tuning_mem_xor.
#define ENABLE_PREFETCH 0   // prefetch the next cacheline of both test areas
#define ENABLE_WBIL2    1   // write-back/invalidate L2 lines after touching each cacheline
#define ENABLE_SBLKDTY  0   // toggle L2C_CTL dissblkdty around the test

#define BDK_SYS_CVMCACHE_INV_L2 "#0,c11,c1,#1"          // L2 Cache Invalidate
#define BDK_CACHE_INV_L2(address) { asm volatile ("sys " BDK_SYS_CVMCACHE_INV_L2 ", %0" : : "r" (address)); }
222 
/*
 * Memory stress/verify test used while sweeping DLL offsets.
 *
 * Fills two mirrored regions (p1 and p1 + p2offset) with a pattern derived
 * from a random seed, XORs both regions with per-burst random values, then
 * compares each location against the predicted value and accumulates which
 * byte lanes miscompared.
 *
 * Parameters:
 *   node     - NUMA node to test
 *   lmc      - memory controller; OR'd into bit 7 of the address to steer
 *              accesses to this LMC
 *   p        - base physical address for the test window
 *   bitmask  - byte-lane enable mask; lanes with 0x00 are not checked
 *   xor_data - optional out[2]: accumulated bad bits (index 1 is always 0
 *              here since ECC bits cannot be observed by this test)
 *
 * Returns a bitmask of byte lanes (bit N = lane N) that saw any error.
 *
 * NOTE(review): statement order and the explicit L2 write-back/invalidate
 * operations are load-pattern critical for tuning; do not reorder.
 */
int dram_tuning_mem_xor(bdk_node_t node, int lmc, uint64_t p, uint64_t bitmask, uint64_t *xor_data)
{
    uint64_t p1, p2, d1, d2;
    uint64_t v, v1;
    uint64_t p2offset = 0x10000000/* was: dram_tune_rank_offset; */; // FIXME?
    uint64_t datamask;
    uint64_t xor;
    uint64_t i, j, k;
    uint64_t ii;
    int errors = 0;
    //uint64_t index;
    uint64_t pattern1 = bdk_rng_get_random64();
    uint64_t pattern2 = 0;
    uint64_t bad_bits[2] = {0,0};

#if ENABLE_SBLKDTY
    BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 0);
#endif

    // Byte lanes may be clear in the mask to indicate no testing on that lane.
    datamask = bitmask;

    // final address must include LMC and node
    p |= (lmc<<7); /* Map address into proper interface */
    p = bdk_numa_get_address(node, p); /* Map to node */

    /* Add offset to both test regions to not clobber boot stuff
     * when running from L2 for NAND boot.
     */
    p += AREA_BASE_OFFSET; // make sure base is out of the way of boot

// Loop strides/bounds for the sparse address sweep below:
// ii strides 512MB chunks, k strides 16KB rows, j strides 512B, i strides
// 8-byte words within a 128-byte cacheline window.
#define II_INC (1ULL << 29)
#define II_MAX (1ULL << 31)
#define K_INC  (1ULL << 14)
#define K_MAX  (1ULL << 20)
#define J_INC  (1ULL <<  9)
#define J_MAX  (1ULL << 12)
#define I_INC  (1ULL <<  3)
#define I_MAX  (1ULL <<  7)

    debug_print("N%d.LMC%d: dram_tuning_mem_xor: phys_addr=0x%lx\n",
              node, lmc, p);

#if 0
    int ix;
    // add this loop to fill memory with the test pattern first
    // loops are ordered so that only entire cachelines are written
    for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
        for (k = 0; k < K_MAX; k += K_INC) {
            for (j = 0; j < J_MAX; j += J_INC) {
                p1 = p + ii + k + j;
                p2 = p1 + p2offset;
                for (i = 0, ix = 0; i < I_MAX; i += I_INC, ix++) {

                    v = dram_tune_test_pattern[ix];
                    v1 = v; // write the same thing to both areas

                    __bdk_dram_write64(p1 + i, v);
                    __bdk_dram_write64(p2 + i, v1);

                }
#if ENABLE_WBIL2
                BDK_CACHE_WBI_L2(p1);
                BDK_CACHE_WBI_L2(p2);
#endif
            }
        }
    } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
#endif

#if ENABLE_PREFETCH
    BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
    BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
#endif

    // INIT: seed both areas with the address-dependent pattern pattern1*(addr).
    // loops are ordered so that only a single 64-bit slot is written to each cacheline at one time,
    // then the cachelines are forced out; this should maximize read/write traffic
    for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
        for (k = 0; k < K_MAX; k += K_INC) {
            for (i = 0; i < I_MAX; i += I_INC) {
                for (j = 0; j < J_MAX; j += J_INC) {

                    p1 = p + ii + k + j;
                    p2 = p1 + p2offset;

#if ENABLE_PREFETCH
                    if (j < (J_MAX - J_INC)) {
                        BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
                        BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
                    }
#endif

                    v = pattern1 * (p1 + i);
                    v1 = v; // write the same thing to both areas

                    __bdk_dram_write64(p1 + i, v);
                    __bdk_dram_write64(p2 + i, v1);

#if ENABLE_WBIL2
                    BDK_CACHE_WBI_L2(p1);
                    BDK_CACHE_WBI_L2(p2);
#endif
                }
            }
        }
    } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */

    BDK_DCACHE_INVALIDATE;

    debug_print("N%d.LMC%d: dram_tuning_mem_xor: done INIT loop\n",
              node, lmc);

    /* Make a series of passes over the memory areas. */

    // NOTE(review): loop bound hard-coded to 1 here (was dram_tune_use_bursts)
    for (int burst = 0; burst < 1/* was: dram_tune_use_bursts*/; burst++)
    {
        uint64_t this_pattern = bdk_rng_get_random64();
        pattern2 ^= this_pattern;  // pattern2 tracks the cumulative XOR applied so far

        /* XOR the data with a random value, applying the change to both
         * memory areas.
         */
#if ENABLE_PREFETCH
        BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
        BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
#endif

        for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
            for (k = 0; k < K_MAX; k += K_INC) {
                for (i = 0; i < I_MAX; i += I_INC) { // FIXME: rearranged, did not make much difference?
                    for (j = 0; j < J_MAX; j += J_INC) {

                        p1 = p + ii + k + j;
                        p2 = p1 + p2offset;

#if ENABLE_PREFETCH
                        if (j < (J_MAX - J_INC)) {
                            BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
                            BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
                        }
#endif

                        v  = __bdk_dram_read64(p1 + i) ^ this_pattern;
                        v1 = __bdk_dram_read64(p2 + i) ^ this_pattern;

#if ENABLE_WBIL2
                        BDK_CACHE_INV_L2(p1);
                        BDK_CACHE_INV_L2(p2);
#endif

                        __bdk_dram_write64(p1 + i, v);
                        __bdk_dram_write64(p2 + i, v1);

#if ENABLE_WBIL2
                        BDK_CACHE_WBI_L2(p1);
                        BDK_CACHE_WBI_L2(p2);
#endif
                    }
                }
            }
        } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */

        BDK_DCACHE_INVALIDATE;

        debug_print("N%d.LMC%d: dram_tuning_mem_xor: done MODIFY loop\n",
                  node, lmc);

#if ENABLE_PREFETCH
        BDK_PREFETCH(p           , BDK_CACHE_LINE_SIZE);
        BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
#endif

        /* Look for differences in the areas. If there is a mismatch, reset
         * both memory locations with the same pattern. Failing to do so
         * means that on all subsequent passes the pair of locations remain
         * out of sync giving spurious errors.
         */
        // FIXME: change the loop order so that an entire cache line is compared at one time
        // FIXME: this is so that a read error that occurs *anywhere* on the cacheline will be caught,
        // FIXME: rather than comparing only 1 cacheline slot at a time, where an error on a different
        // FIXME: slot will be missed that time around
        // Does the above make sense?

        for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
            for (k = 0; k < K_MAX; k += K_INC) {
                for (j = 0; j < J_MAX; j += J_INC) {

                    p1 = p + ii + k + j;
                    p2 = p1 + p2offset;

#if ENABLE_PREFETCH
                    if (j < (J_MAX - J_INC)) {
                        BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
                        BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
                    }
#endif

                    // process entire cachelines in the innermost loop
                    for (i = 0; i < I_MAX; i += I_INC) {

                        v = ((p1 + i) * pattern1) ^ pattern2; // FIXME: this should predict what we find...???
                        d1 = __bdk_dram_read64(p1 + i);
                        d2 = __bdk_dram_read64(p2 + i);

                        xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes

                        if (!xor)
                            continue;

                        // accumulate bad bits
                        bad_bits[0] |= xor;
                        //bad_bits[1] |= ~mpr_data1 & 0xffUL; // cannot do ECC here

                        // walk the remaining error bits lane by lane,
                        // setting the per-lane error flag and removing the
                        // lane from the active datamask
                        int bybit = 1;
                        uint64_t bymsk = 0xffULL; // start in byte lane 0
                        while (xor != 0) {
                            debug_print("ERROR(%03d): [0x%016lX] [0x%016lX]  expected 0x%016lX d1 %016lX d2 %016lX\n",
                                        burst, p1, p2, v, d1, d2);
                            if (xor & bymsk) { // error(s) in this lane
                                errors |= bybit; // set the byte error bit
                                xor &= ~bymsk; // clear byte lane in error bits
                                datamask &= ~bymsk; // clear the byte lane in the mask
#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
                                if (datamask == 0) { // nothing left to do
                                    return errors; // completely done when errors found in all byte lanes in datamask
                                }
#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
                            }
                            bymsk <<= 8; // move mask into next byte lane
                            bybit <<= 1; // move bit into next byte position
                        }
                    }
#if ENABLE_WBIL2
                    BDK_CACHE_WBI_L2(p1);
                    BDK_CACHE_WBI_L2(p2);
#endif
                }
            }
        } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */

        debug_print("N%d.LMC%d: dram_tuning_mem_xor: done TEST loop\n",
                  node, lmc);

    } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */

    if (xor_data != NULL) { // send the bad bits back...
        xor_data[0] = bad_bits[0];
        xor_data[1] = bad_bits[1]; // let it be zeroed
    }

#if ENABLE_SBLKDTY
    BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 1);
#endif

    return errors;
}
479 
// NOTE(review): only II_INC/II_MAX are undefined here; K/J/I strides from
// dram_tuning_mem_xor remain defined for the rest of the file.
#undef II_INC
#undef II_MAX

// Extract a bit field of `width` bits starting at `lsb` from value v.
#define EXTRACT(v, lsb, width) (((v) >> (lsb)) & ((1ull << (width)) - 1))
// Hash address bits 7/12/20 down to an LMC number (xbits selects field width).
#define LMCNO(address, xbits) (EXTRACT(address, 7, xbits) ^ EXTRACT(address, 20, xbits) ^ EXTRACT(address, 12, xbits))

// cores to use
#define DEFAULT_USE_CORES 44   // FIXME: was (1 << CORE_BITS)
int dram_tune_use_cores = DEFAULT_USE_CORES; // max cores to use, override available
int dram_tune_max_cores; // max cores available on a node
#define CORE_SHIFT 22          // FIXME: offset into rank_address passed to test_dram_byte

// Signature for a tuning worker thread entry point.
typedef void (*__dram_tuning_thread_t)(int arg, void *arg1);

// Parameters handed to each tuning worker thread.
typedef struct
{
    bdk_node_t node;
    int64_t num_lmcs;
    uint64_t byte_mask;
} test_dram_byte_info_t;

static int dram_tune_use_xor2 = 1; // FIXME: do NOT default to original mem_xor (LMC-based) code
502 
/*
 * Set up shared state for the per-core DRAM test threads and wait for them
 * to finish, then return the summary error bits.
 *
 * NOTE(review): in this coreboot port no threads are actually started —
 * total_count stays 0, so the wait loop exits immediately, and a node that
 * reports bdk_numa_exists() dies deliberately (see FIXME below). The thread
 * machinery is retained from the original BDK sources.
 *
 * Returns the accumulated error summary across all LMCs
 * (test_dram_byte_threads_errs).
 */
static int
run_dram_tuning_threads(bdk_node_t node, int num_lmcs, uint64_t bytemask)
{
    test_dram_byte_info_t test_dram_byte_info;
    test_dram_byte_info_t *test_info = &test_dram_byte_info;
    int total_count = 0;

    test_info->node = node;
    test_info->num_lmcs = num_lmcs;
    test_info->byte_mask = bytemask;

    // init some global data
    bdk_atomic_set64(&test_dram_byte_threads_done, 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_threads_errs, 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[0], 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[1], 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[2], 0);
    bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[3], 0);

    /* Start threads for cores on the node */
    if (bdk_numa_exists(node)) {
        /* FIXME(dhendrix): We shouldn't hit this. */
        die("bdk_numa_exists() is non-zero\n");
    }

#if 0
    /* Wait for threads to finish */
    while (bdk_atomic_get64(&test_dram_byte_threads_done) < total_count)
        bdk_thread_yield();
#else
#define TIMEOUT_SECS 5  // FIXME: long enough so a pass for a given setting will not print
        /* Wait for threads to finish, with progress */
        int cur_count;
        uint64_t cur_time;
        uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME?
        uint64_t timeout = bdk_clock_get_count(BDK_CLOCK_TIME) + period;
        do {
//            bdk_thread_yield();        /* FIXME(dhendrix): don't yield... */
            cur_count = bdk_atomic_get64(&test_dram_byte_threads_done);
            cur_time = bdk_clock_get_count(BDK_CLOCK_TIME);
            if (cur_time >= timeout) {
                printf("Waiting for %d cores\n", total_count - cur_count);
                timeout = cur_time + period;
            }
        } while (cur_count < total_count);
#endif

    // NOTE: this is the summary of errors across all LMCs
    return (int)bdk_atomic_get64((int64_t *)&test_dram_byte_threads_errs);
}
553 
/* These variables count the number of ECC errors. They should only be accessed atomically */
/* FIXME(dhendrix): redundant declaration in original BDK sources */
//extern int64_t __bdk_dram_ecc_single_bit_errors[];
extern int64_t __bdk_dram_ecc_double_bit_errors[];

// DLL offset sweep parameters: test every Nth offset in [-63, +63].
#define DEFAULT_SAMPLE_GRAN 3 // sample for errors every N offset values
#define MIN_BYTE_OFFSET -63
#define MAX_BYTE_OFFSET +63
// Current sampling granularity; overridable by callers before tuning.
int dram_tune_use_gran = DEFAULT_SAMPLE_GRAN;
563 
564 static int
auto_set_dll_offset(bdk_node_t node,int dll_offset_mode,int num_lmcs,int ddr_interface_64b,int do_tune)565 auto_set_dll_offset(bdk_node_t node, int dll_offset_mode,
566                     int num_lmcs, int ddr_interface_64b,
567                     int do_tune)
568 {
569     int byte_offset;
570     //unsigned short result[9];
571     int byte;
572     int byte_delay_start[4][9];
573     int byte_delay_count[4][9];
574     uint64_t byte_delay_windows [4][9];
575     int byte_delay_best_start[4][9];
576     int byte_delay_best_count[4][9];
577     //int this_rodt;
578     uint64_t ops_sum[4], dclk_sum[4];
579     uint64_t start_dram_dclk[4], stop_dram_dclk[4];
580     uint64_t start_dram_ops[4], stop_dram_ops[4];
581     int errors, tot_errors;
582     int lmc;
583     const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write";        /* FIXME(dhendrix): const */
584     int mode_is_read = (dll_offset_mode == 2);
585     const char *mode_blk = (dll_offset_mode == 2) ? " " : "";                /* FIXME(dhendrix): const */
586     int start_offset, end_offset, incr_offset;
587 
588     int speed_bin = get_speed_bin(node, 0); // FIXME: just get from LMC0?
589     int needs_review_count = 0;
590 
591     if (dram_tune_use_gran != DEFAULT_SAMPLE_GRAN) {
592         ddr_print2("N%d: Changing sample granularity from %d to %d\n",
593                   node, DEFAULT_SAMPLE_GRAN, dram_tune_use_gran);
594     }
595     // ensure sample is taken at 0
596     start_offset = MIN_BYTE_OFFSET - (MIN_BYTE_OFFSET % dram_tune_use_gran);
597     end_offset   = MAX_BYTE_OFFSET - (MAX_BYTE_OFFSET % dram_tune_use_gran);
598     incr_offset  = dram_tune_use_gran;
599 
600     memset(ops_sum, 0, sizeof(ops_sum));
601     memset(dclk_sum, 0, sizeof(dclk_sum));
602     memset(byte_delay_start, 0, sizeof(byte_delay_start));
603     memset(byte_delay_count, 0, sizeof(byte_delay_count));
604     memset(byte_delay_windows,  0, sizeof(byte_delay_windows));
605     memset(byte_delay_best_start, 0, sizeof(byte_delay_best_start));
606     memset(byte_delay_best_count, 0, sizeof(byte_delay_best_count));
607 
608     // FIXME? consult LMC0 only
609     BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(0));
610     if (lmcx_config.s.rank_ena) { // replace the default offset when there is more than 1 rank...
611         dram_tune_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
612         /* FIXME(dhendrix): %lx --> %llx */
613         ddr_print2("N%d: Tuning multiple ranks per DIMM (rank offset 0x%llx).\n", node, dram_tune_rank_offset);
614     }
615     if (lmcx_config.s.init_status & 0x0c) { // bit 2 or 3 set indicates 2 DIMMs
616         dram_tune_dimm_offset = 1ull << (28 + lmcx_config.s.pbank_lsb + (num_lmcs/2));
617         /* FIXME(dhendrix): %lx --> %llx */
618         ddr_print2("N%d: Tuning multiple DIMMs per channel (DIMM offset 0x%llx)\n", node, dram_tune_dimm_offset);
619     }
620 
621     // FIXME? do this for LMC0 only
622     //BDK_CSR_INIT(comp_ctl2, node, BDK_LMCX_COMP_CTL2(0));
623     //this_rodt = comp_ctl2.s.rodt_ctl;
624 
625     // construct the bytemask
626     int bytes_todo = (ddr_interface_64b) ? 0xff : 0x0f;
627     uint64_t bytemask = 0;
628     for (byte = 0; byte < 8; ++byte) {
629         if (bytes_todo & (1 << byte)) {
630             bytemask |= 0xfful << (8*byte); // set the bytes bits in the bytemask
631         }
632     } /* for (byte = 0; byte < 8; ++byte) */
633 
634     // now loop through selected legal values for the DLL byte offset...
635 
636     for (byte_offset = start_offset; byte_offset <= end_offset; byte_offset += incr_offset) {
637 
638         // do the setup on active LMCs
639         for (lmc = 0; lmc < num_lmcs; lmc++) {
640             change_dll_offset_enable(node, lmc, 0);
641 
642             // set all byte lanes at once
643             load_dll_offset(node, lmc, dll_offset_mode, byte_offset, 10 /* All bytes at once */);
644             // but then clear the ECC byte lane so it should be neutral for the test...
645             load_dll_offset(node, lmc, dll_offset_mode, 0, 8);
646 
647             change_dll_offset_enable(node, lmc, 1);
648 
649             // record start cycle CSRs here for utilization measure
650             start_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
651             start_dram_ops[lmc]  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
652         } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
653 
654         bdk_watchdog_poke();
655 
656         // run the test(s)
657         // only 1 call should be enough, let the bursts, etc, control the load...
658         run_dram_tuning_threads(node, num_lmcs, bytemask);
659 
660         for (lmc = 0; lmc < num_lmcs; lmc++) {
661             // record stop cycle CSRs here for utilization measure
662             stop_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
663             stop_dram_ops[lmc]  = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
664 
665             // accumulate...
666             ops_sum[lmc]  += stop_dram_ops[lmc]  - start_dram_ops[lmc];
667             dclk_sum[lmc] += stop_dram_dclk[lmc] - start_dram_dclk[lmc];
668 
669             errors = test_dram_byte_lmc_errs[lmc];
670 
671             // check errors by byte, but not ECC
672             for (byte = 0; byte < 8; ++byte) {
673                 if (!(bytes_todo & (1 << byte))) // is this byte lane to be done
674                     continue; // no
675 
676                 byte_delay_windows[lmc][byte] <<= 1; // always put in a zero
677                 if (errors & (1 << byte)) { // yes, an error in this byte lane
678                     byte_delay_count[lmc][byte] = 0; // stop now always
679                 } else { // no error in this byte lane
680                     if (byte_delay_count[lmc][byte] == 0) { // first success, set run start
681                         byte_delay_start[lmc][byte] = byte_offset;
682                     }
683                     byte_delay_count[lmc][byte] += incr_offset; // bump run length
684 
685                     if (byte_delay_count[lmc][byte] > byte_delay_best_count[lmc][byte]) {
686                         byte_delay_best_count[lmc][byte] = byte_delay_count[lmc][byte];
687                         byte_delay_best_start[lmc][byte] = byte_delay_start[lmc][byte];
688                     }
689                     byte_delay_windows[lmc][byte] |= 1ULL; // for pass, put in a 1
690                 }
691             } /* for (byte = 0; byte < 8; ++byte) */
692 
693             // only print when there are errors and verbose...
694             if (errors) {
695                 debug_print("DLL %s Offset Test %3d: errors 0x%x\n",
696                             mode_str, byte_offset, errors);
697             }
698         } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
699 
700     } /* for (byte_offset=-63; byte_offset<63; byte_offset += incr_offset) */
701 
702     // done with testing, load up and/or print out the offsets we found...
703 
704     // only when margining...
705     if (!do_tune) {
706         printf("  \n");
707         printf("-------------------------------------\n");
708 #if 0
709         uint32_t mts_speed = (libdram_get_freq_from_pll(node, 0) * 2) / 1000000; // FIXME: sample LMC0
710         printf("N%d: Starting %s Timing Margining for %d MT/s.\n", node, mode_str, mts_speed);
711 #else
712         printf("N%d: Starting %s Timing Margining.\n", node, mode_str);
713 #endif
714         printf("  \n");
715     } /* if (!do_tune) */
716 
717     for (lmc = 0; lmc < num_lmcs; lmc++) {
718 #if 1
719         // FIXME FIXME
720         // FIXME: this just makes ECC always show 0
721         byte_delay_best_start[lmc][8] = start_offset;
722         byte_delay_best_count[lmc][8] = end_offset - start_offset + incr_offset;
723 #endif
724 
725         // disable offsets while we load...
726         change_dll_offset_enable(node, lmc, 0);
727 
728         // only when margining...
729         if (!do_tune) {
730             // print the heading
731             printf("  \n");
732             printf("N%d.LMC%d: %s Timing Margin     %s : ", node, lmc, mode_str, mode_blk);
733             printf("     ECC/8 ");
734             for (byte = 7; byte >= 0; byte--) {
735                 printf("    Byte %d ", byte);
736             }
737             printf("\n");
738         } /* if (!do_tune) */
739 
740         // print and load the offset values
741         // print the windows bit arrays
742         // only when margining...
743         if (!do_tune) {
744             printf("N%d.LMC%d: DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
745         } else {
746             ddr_print("N%d.LMC%d: SW DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
747         }
748         for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
749 
750             int count = byte_delay_best_count[lmc][byte];
751             if (count == 0)
752                 count = incr_offset; // should make non-tested ECC byte come out 0
753 
754             byte_offset =  byte_delay_best_start[lmc][byte] +
755                 ((count - incr_offset) / 2); // adj by incr
756 
757             if (!do_tune) { // do counting and special flag if margining
758                 int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
759                                        !is_low_risk_offset(speed_bin, byte_offset);
760 
761                 printf("%10d%c", byte_offset, (will_need_review) ? '<' :' ');
762 
763                 if (will_need_review)
764                     needs_review_count++;
765             } else { // if just tuning, make the printout less lengthy
766                 ddr_print("%5d ", byte_offset);
767             }
768 
769             // FIXME? should we be able to override this?
770             if (mode_is_read) // for READ offsets, always store what we found
771                 load_dll_offset(node, lmc, dll_offset_mode, byte_offset, byte);
772             else // for WRITE offsets, always store 0
773                 load_dll_offset(node, lmc, dll_offset_mode, 0, byte);
774 
775         }
776         if (!do_tune) {
777             printf("\n");
778         } else {
779             ddr_print("\n");
780         }
781 
782 
783         // re-enable the offsets now that we are done loading
784         change_dll_offset_enable(node, lmc, 1);
785 
786         // only when margining...
787         if (!do_tune) {
788             // print the window sizes
789             printf("N%d.LMC%d: DLL %s Window Length %s : ", node, lmc, mode_str, mode_blk);
790             for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
791                 int count = byte_delay_best_count[lmc][byte];
792                 if (count == 0)
793                     count = incr_offset; // should make non-tested ECC byte come out 0
794 
795                 // do this again since the "needs review" test is an AND...
796                 byte_offset =  byte_delay_best_start[lmc][byte] +
797                     ((count - incr_offset) / 2); // adj by incr
798 
799                 int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
800                     !is_low_risk_offset(speed_bin, byte_offset);
801 
802                 printf("%10d%c", count - incr_offset, (will_need_review) ? '<' :' ');
803             }
804             printf("\n");
805 
806             // print the window extents
807             printf("N%d.LMC%d: DLL %s Window Bounds %s : ", node, lmc, mode_str, mode_blk);
808             for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
809                 int start = byte_delay_best_start[lmc][byte];
810                 int count = byte_delay_best_count[lmc][byte];
811                 if (count == 0)
812                     count = incr_offset; // should make non-tested ECC byte come out 0
813                 printf(" %3d to%3d ", start,
814                        start + count - incr_offset);
815             }
816             printf("\n");
817 #if 0
818             // FIXME: should have a way to force these out...
819             // print the windows bit arrays
820             printf("N%d.LMC%d: DLL %s Window Bitmap%s : ", node, lmc, mode_str, mode_blk);
821             for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
822                 printf("%010lx ", byte_delay_windows[lmc][byte]);
823             }
824             printf("\n");
825 #endif
826         } /* if (!do_tune) */
827     } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
828 
829     // only when margining...
830     if (!do_tune) {
831         // print the Summary line(s) here
832         printf("  \n");
833         printf("N%d: %s Timing Margining Summary : %s ", node, mode_str,
834                (needs_review_count > 0) ? "Needs Review" : "Low Risk");
835         if (needs_review_count > 0)
836             printf("(%d)", needs_review_count);
837         printf("\n");
838 
839         // FIXME??? want to print here: "N0: %s Offsets have been applied already"
840 
841         printf("-------------------------------------\n");
842         printf("  \n");
843     } /* if (!do_tune) */
844 
845     // FIXME: we probably want this only when doing verbose...
846     // finally, print the utilizations all together
847     for (lmc = 0; lmc < num_lmcs; lmc++) {
848         uint64_t percent_x10 = ops_sum[lmc] * 1000 / dclk_sum[lmc];
849         /* FIXME(dhendrix): %lu --> %llu */
850         ddr_print2("N%d.LMC%d: ops %llu, cycles %llu, used %llu.%llu%%\n",
851                   node, lmc, ops_sum[lmc], dclk_sum[lmc], percent_x10 / 10, percent_x10 % 10);
852     } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
853 
854     // FIXME: only when verbose, or only when there are errors?
855     // run the test one last time
856     // print whether there are errors or not, but only when verbose...
857     bdk_watchdog_poke();
858     debug_print("N%d: %s: Start running test one last time\n", node, __func__);
859     tot_errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
860     debug_print("N%d: %s: Finished running test one last time\n", node, __func__);
861     if (tot_errors)
862         ddr_print2("%s Timing Final Test: errors 0x%x\n", mode_str, tot_errors);
863 
864     return (do_tune) ? tot_errors : !!(needs_review_count > 0);
865 }
866 
867 #define USE_L2_WAYS_LIMIT 0 // non-zero to enable L2 ways limiting
868 
869 /*
870  * Automatically adjust the DLL offset for the data bytes
871  */
perform_dll_offset_tuning(bdk_node_t node,int dll_offset_mode,int do_tune)872 int perform_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int do_tune)
873 {
874     int ddr_interface_64b;
875     int save_ecc_ena[4];
876     bdk_lmcx_config_t lmc_config;
877     int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
878     const char *s;
879 #if USE_L2_WAYS_LIMIT
880     int ways, ways_print = 0;
881 #endif
882 #if 0
883     int dram_tune_use_rodt = -1, save_rodt[4];
884     bdk_lmcx_comp_ctl2_t comp_ctl2;
885 #endif
886     int loops = 1, loop;
887     uint64_t orig_coremask;
888     int errs = 0;
889 
890     // enable any non-running cores on this node
891     orig_coremask = bdk_get_running_coremask(node);
892     /* FIXME(dhendrix): %lx --> %llx */
893     ddr_print4("N%d: %s: Starting cores (mask was 0x%llx)\n",
894               node, __func__, orig_coremask);
895         /* FIXME(dhendrix): don't call bdk_init_cores(). */
896 //    bdk_init_cores(node, ~0ULL & ~orig_coremask);
897     dram_tune_max_cores = bdk_get_num_running_cores(node);
898 
899     // but use only a certain number of cores, at most what is available
900     if ((s = getenv("ddr_tune_use_cores")) != NULL) {
901         dram_tune_use_cores = strtoul(s, NULL, 0);
902         if (dram_tune_use_cores <= 0) // allow 0 or negative to mean all
903             dram_tune_use_cores = dram_tune_max_cores;
904     }
905     if (dram_tune_use_cores > dram_tune_max_cores)
906         dram_tune_use_cores = dram_tune_max_cores;
907 
908     // see if we want to do the tuning more than once per LMC...
909     if ((s = getenv("ddr_tune_use_loops"))) {
910         loops = strtoul(s, NULL, 0);
911     }
912 
913     // see if we want to change the granularity of the byte_offset sampling
914     if ((s = getenv("ddr_tune_use_gran"))) {
915         dram_tune_use_gran = strtoul(s, NULL, 0);
916     }
917 
918     // allow override of the test repeats (bursts) per thread create
919     if ((s = getenv("ddr_tune_use_bursts")) != NULL) {
920         dram_tune_use_bursts = strtoul(s, NULL, 10);
921     }
922 
923 #if 0
924     // allow override of Read ODT setting just during the tuning run(s)
925     if ((s = getenv("ddr_tune_use_rodt")) != NULL) {
926         int temp = strtoul(s, NULL, 10);
927         // validity check
928         if (temp >= 0 && temp <= 7)
929             dram_tune_use_rodt = temp;
930     }
931 #endif
932 
933 #if 0
934     // allow override of the test pattern
935     // FIXME: a bit simplistic...
936     if ((s = getenv("ddr_tune_use_pattern")) != NULL) {
937         int patno = strtoul(s, NULL, 10);
938         if (patno == 2)
939             dram_tune_test_pattern = test_pattern_2;
940         else if (patno == 3)
941             dram_tune_test_pattern = test_pattern_3;
942         else // all other values use default
943             dram_tune_test_pattern = test_pattern_1;
944     }
945 #endif
946 
947     // allow override of the test mem_xor algorithm
948     if ((s = getenv("ddr_tune_use_xor2")) != NULL) {
949         dram_tune_use_xor2 = !!strtoul(s, NULL, 10);
950     }
951 
952     // print current working values
953     ddr_print2("N%d: Tuning will use %d cores of max %d cores, and use %d repeats.\n",
954                 node, dram_tune_use_cores, dram_tune_max_cores,
955                 dram_tune_use_bursts);
956 
957 #if USE_L2_WAYS_LIMIT
958     // see if L2 ways are limited
959     if ((s = lookup_env_parameter("limit_l2_ways")) != NULL) {
960         ways = strtoul(s, NULL, 10);
961         ways_print = 1;
962     } else {
963         ways = bdk_l2c_get_num_assoc(node);
964     }
965 #endif
966 
967 #if 0
968     // if RODT is to be overridden during tuning, note change
969     if (dram_tune_use_rodt >= 0) {
970         ddr_print("N%d: using RODT %d for tuning.\n",
971                   node, dram_tune_use_rodt);
972     }
973 #endif
974 
975     // FIXME? get flag from LMC0 only
976     lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));
977     ddr_interface_64b = !lmc_config.s.mode32b;
978 
979     // do setup for each active LMC
980     debug_print("N%d: %s: starting LMCs setup.\n", node, __func__);
981     for (lmc = 0; lmc < num_lmcs; lmc++) {
982 
983 #if 0
984         // if RODT change, save old and set new here...
985         if (dram_tune_use_rodt >= 0) {
986             comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
987             save_rodt[lmc] = comp_ctl2.s.rodt_ctl;
988             comp_ctl2.s.rodt_ctl = dram_tune_use_rodt;
989             DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
990             BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
991         }
992 #endif
993         /* Disable ECC for DRAM tests */
994         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
995         save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
996         lmc_config.s.ecc_ena = 0;
997         DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
998         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
999 
1000     } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
1001 
1002 #if USE_L2_WAYS_LIMIT
1003     /* Disable l2 sets for DRAM testing */
1004     limit_l2_ways(node, 0, ways_print);
1005 #endif
1006 
1007     // testing is done on all LMCs simultaneously
1008     // FIXME: for now, loop here to show what happens multiple times
1009     for (loop = 0; loop < loops; loop++) {
1010         /* Perform DLL offset tuning */
1011         errs = auto_set_dll_offset(node, dll_offset_mode, num_lmcs, ddr_interface_64b, do_tune);
1012     }
1013 
1014 #if USE_L2_WAYS_LIMIT
1015     /* Restore the l2 set configuration */
1016     limit_l2_ways(node, ways, ways_print);
1017 #endif
1018 
1019     // perform cleanup on all active LMCs
1020     debug_print("N%d: %s: starting LMCs cleanup.\n", node, __func__);
1021     for (lmc = 0; lmc < num_lmcs; lmc++) {
1022 
1023         /* Restore ECC for DRAM tests */
1024         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
1025         lmc_config.s.ecc_ena = save_ecc_ena[lmc];
1026         DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
1027         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
1028 #if 0
1029         // if RODT change, restore old here...
1030         if (dram_tune_use_rodt >= 0) {
1031             comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
1032             comp_ctl2.s.rodt_ctl = save_rodt[lmc];
1033             DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
1034             BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
1035         }
1036 #endif
1037         // finally, see if there are any read offset overrides after tuning
1038         // FIXME: provide a way to do write offsets also??
1039         if (dll_offset_mode == 2) {
1040             for (int by = 0; by < 9; by++) {
1041                 if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
1042                     int dllro = strtoul(s, NULL, 10);
1043                     change_dll_offset_enable(node, lmc, 0);
1044                     load_dll_offset(node, lmc, /* read */2, dllro, by);
1045                     change_dll_offset_enable(node, lmc, 1);
1046                 }
1047             }
1048         }
1049     } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
1050 
1051     // finish up...
1052 
1053 #if 0
1054     // if RODT was overridden during tuning, note restore
1055     if (dram_tune_use_rodt >= 0) {
1056         ddr_print("N%d: restoring RODT %d after tuning.\n",
1057                   node, save_rodt[0]); // FIXME? use LMC0
1058     }
1059 #endif
1060 
1061     // put any cores on this node, that were not running at the start, back into reset
1062     /* FIXME(dhendrix): don't reset cores... */
1063 //    uint64_t reset_coremask = bdk_get_running_coremask(node) & ~orig_coremask;
1064     uint64_t reset_coremask = 0;
1065     if (reset_coremask) {
1066         /* FIXME(dhendrix): %lx --> %llx */
1067         ddr_print4("N%d: %s: Stopping cores 0x%llx\n", node, __func__,
1068                   reset_coremask);
1069         bdk_reset_cores(node, reset_coremask);
1070     } else {
1071         /* FIXME(dhendrix): %lx --> %llx */
1072         ddr_print4("N%d: %s: leaving cores set to 0x%llx\n", node, __func__,
1073                   orig_coremask);
1074     }
1075 
1076     return errs;
1077 
1078 } /* perform_dll_offset_tuning */
1079 
1080 /////////////////////////////////////////////////////////////////////////////////////////////
1081 
1082 /////    HW-assist byte DLL offset tuning   //////
1083 
#if 1
// setup defaults for byte test pattern array
// Each table is a {GP0, GP1, GP2} triple as consumed by setup_hw_pattern():
// GP0/GP1 are full 64-bit values, GP2 supplies the ECC bits.
// take these first two from the HRM section 6.9.13
static const uint64_t byte_pattern_0[] = {
    0xFFAAFFFFFF55FFFFULL, // GP0
    0x55555555AAAAAAAAULL, // GP1
    0xAA55AAAAULL,         // GP2
};
static const uint64_t byte_pattern_1[] = {
    0xFBF7EFDFBF7FFEFDULL, // GP0
    0x0F1E3C78F0E1C387ULL, // GP1
    0xF0E1BF7FULL,         // GP2
};
// this is from Andrew via LFSR with PRBS=0xFFFFAAAA
static const uint64_t byte_pattern_2[] = {
    0xEE55AADDEE55AADDULL, // GP0
    0x55AADDEE55AADDEEULL, // GP1
    0x55EEULL,             // GP2
};
// this is from Mike via LFSR with PRBS=0x4A519909
static const uint64_t byte_pattern_3[] = {
    0x0088CCEE0088CCEEULL, // GP0
    0xBB552211BB552211ULL, // GP1
    0xBB00ULL,             // GP2
};

// the set of patterns tried by the non-LFSR HW-assist test paths
static const uint64_t *byte_patterns[] = {
    byte_pattern_0, byte_pattern_1, byte_pattern_2, byte_pattern_3 // FIXME: use all we have
};
// number of entries in byte_patterns[]
#define NUM_BYTE_PATTERNS ((int)(sizeof(byte_patterns)/sizeof(uint64_t *)))

#define DEFAULT_BYTE_BURSTS 32 // FIXME: this is what the longest test usually has
int dram_tune_byte_bursts = DEFAULT_BYTE_BURSTS; // env-overridable via "ddr_tune_byte_bursts"
#endif
1118 
1119 static void
setup_hw_pattern(bdk_node_t node,int lmc,const uint64_t * pattern_p)1120 setup_hw_pattern(bdk_node_t node, int lmc, const uint64_t *pattern_p)
1121 {
1122     /*
1123       3) Setup GENERAL_PURPOSE[0-2] registers with the data pattern of choice.
1124       a. GENERAL_PURPOSE0[DATA<63:0>] – sets the initial lower (rising edge) 64 bits of data.
1125       b. GENERAL_PURPOSE1[DATA<63:0>] – sets the initial upper (falling edge) 64 bits of data.
1126       c. GENERAL_PURPOSE2[DATA<15:0>] – sets the initial lower (rising edge <7:0>) and upper
1127       (falling edge <15:8>) ECC data.
1128     */
1129     DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE0(lmc), pattern_p[0]);
1130     DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE1(lmc), pattern_p[1]);
1131     DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE2(lmc), pattern_p[2]);
1132 }
1133 
1134 #define DEFAULT_PRBS 0xFFFFAAAAUL /* FIXME: maybe try 0x4A519909UL */
1135 
1136 static void
setup_lfsr_pattern(bdk_node_t node,int lmc,uint64_t data)1137 setup_lfsr_pattern(bdk_node_t node, int lmc, uint64_t data)
1138 {
1139     uint32_t prbs;
1140     const char *s;
1141 
1142     if ((s = getenv("ddr_lfsr_prbs"))) {
1143         prbs = strtoul(s, NULL, 0);
1144     } else
1145         prbs = DEFAULT_PRBS; // FIXME: from data arg?
1146 
1147     /*
1148       2) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 1
1149          here data comes from the LFSR generating a PRBS pattern
1150          CHAR_CTL.EN = 0
1151          CHAR_CTL.SEL = 0; // for PRBS
1152          CHAR_CTL.DR = 1;
1153          CHAR_CTL.PRBS = setup for whatever type of PRBS to send
1154          CHAR_CTL.SKEW_ON = 1;
1155     */
1156     BDK_CSR_INIT(char_ctl, node, BDK_LMCX_CHAR_CTL(lmc));
1157     char_ctl.s.en      = 0;
1158     char_ctl.s.sel     = 0;
1159     char_ctl.s.dr      = 1;
1160     char_ctl.s.prbs    = prbs;
1161     char_ctl.s.skew_on = 1;
1162     DRAM_CSR_WRITE(node, BDK_LMCX_CHAR_CTL(lmc), char_ctl.u);
1163 }
1164 
1165 /* FIXME(dhendrix): made static to avoid need for prototype */
1166 static int
choose_best_hw_patterns(bdk_node_t node,int lmc,int mode)1167 choose_best_hw_patterns(bdk_node_t node, int lmc, int mode)
1168 {
1169     int new_mode = mode;
1170     const char *s;
1171 
1172     switch (mode) {
1173     case DBTRAIN_TEST: // always choose LFSR if chip supports it
1174         if (! CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
1175             int lfsr_enable = 1;
1176             if ((s = getenv("ddr_allow_lfsr"))) { // override?
1177                 lfsr_enable = !!strtoul(s, NULL, 0);
1178             }
1179             if (lfsr_enable)
1180                 new_mode = DBTRAIN_LFSR;
1181         }
1182         break;
1183     case DBTRAIN_DBI: // possibly can allow LFSR use?
1184         break;
1185     case DBTRAIN_LFSR: // forced already
1186         if (CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
1187             ddr_print("ERROR: illegal HW assist mode %d\n", mode);
1188             new_mode = DBTRAIN_TEST;
1189         }
1190         break;
1191     default:
1192         ddr_print("ERROR: unknown HW assist mode %d\n", mode);
1193     }
1194 
1195     if (new_mode != mode)
1196         VB_PRT(VBL_DEV2, "choose_best_hw_patterns: changing mode %d to %d\n", mode, new_mode);
1197 
1198     return new_mode;
1199 }
1200 
1201 int
run_best_hw_patterns(bdk_node_t node,int lmc,uint64_t phys_addr,int mode,uint64_t * xor_data)1202 run_best_hw_patterns(bdk_node_t node, int lmc, uint64_t phys_addr,
1203                      int mode, uint64_t *xor_data)
1204 {
1205     int pattern;
1206     const uint64_t *pattern_p;
1207     int errs, errors = 0;
1208 
1209     // FIXME? always choose LFSR if chip supports it???
1210     mode = choose_best_hw_patterns(node, lmc, mode);
1211 
1212     if (mode == DBTRAIN_LFSR) {
1213             setup_lfsr_pattern(node, lmc, 0);
1214             errors = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);
1215             VB_PRT(VBL_DEV2, "%s: LFSR at A:0x%012llx errors 0x%x\n",
1216                    __func__, phys_addr, errors);
1217     } else {
1218         for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {
1219             pattern_p = byte_patterns[pattern];
1220             setup_hw_pattern(node, lmc, pattern_p);
1221 
1222             errs = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);
1223 
1224             VB_PRT(VBL_DEV2, "%s: PATTERN %d at A:0x%012llx errors 0x%x\n",
1225                    __func__, pattern, phys_addr, errs);
1226 
1227             errors |= errs;
1228         } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */
1229     }
1230     return errors;
1231 }
1232 
/*
 * Sweep the DLL byte offset over [-63, 63] for one LMC using the
 * HW-assisted DBTRAIN test (or LFSR if the chip supports it), track the
 * largest passing window per rank and bytelane for each test pattern,
 * then load the averaged best offset(s) back into the DLL.
 *
 * dll_offset_mode: 2 = Read, else Write (selects prints and the offset
 *                  register set given to load_dll_offset()).
 * bytelane:        0-8 for a single bytelane, 0x0A for all 9 lanes.
 */
static void
hw_assist_test_dll_offset(bdk_node_t node, int dll_offset_mode,
                          int lmc, int bytelane)
{
    int byte_offset, new_best_offset[9];   // per-byte sum of pattern averages
    int rank_delay_start[4][9];            // current passing-run start, per rank/byte
    int rank_delay_count[4][9];            // current passing-run length, per rank/byte
    int rank_delay_best_start[4][9];       // best (longest) run start seen so far
    int rank_delay_best_count[4][9];       // best run length seen so far
    int errors[4];                         // per-rank error bitmask from the HW test
    int num_lmcs = __bdk_dram_get_num_lmc(node);
    int rank_mask, rankx, active_ranks;
    int pattern;
    const uint64_t *pattern_p;
    int byte;
    const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write";
    int pat_best_offset[9];
    uint64_t phys_addr;
    int pat_beg, pat_end;
    int rank_beg, rank_end;
    int byte_lo, byte_hi;                  // inclusive bytelane range to test
    uint64_t hw_rank_offset;               // physical address stride between ranks
    // FIXME? always choose LFSR if chip supports it???
    int mode = choose_best_hw_patterns(node, lmc, DBTRAIN_TEST);

    if (bytelane == 0x0A) { // all bytelanes
        byte_lo = 0;
        byte_hi = 8;
    } else { // just 1
        byte_lo = byte_hi = bytelane;
    }

    BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(lmc));
    rank_mask = lmcx_config.s.init_status;
    // this should be correct for 1 or 2 ranks, 1 or 2 DIMMs
    hw_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));

    debug_print("N%d: %s: starting LMC%d with rank offset 0x%lx\n",
                node, __func__, lmc, hw_rank_offset);

    // start of pattern loop
    // we do the set of tests for each pattern supplied...

    memset(new_best_offset, 0, sizeof(new_best_offset));
    for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {

        memset(pat_best_offset, 0, sizeof(pat_best_offset));

        // program either the fixed pattern or the LFSR for this pass
        if (mode == DBTRAIN_TEST) {
            pattern_p = byte_patterns[pattern];
            setup_hw_pattern(node, lmc, pattern_p);
        } else {
            setup_lfsr_pattern(node, lmc, 0);
        }

        // now loop through all legal values for the DLL byte offset...

#define BYTE_OFFSET_INCR 3 // FIXME: make this tunable?

        memset(rank_delay_count, 0, sizeof(rank_delay_count));
        memset(rank_delay_start, 0, sizeof(rank_delay_start));
        memset(rank_delay_best_count, 0, sizeof(rank_delay_best_count));
        memset(rank_delay_best_start, 0, sizeof(rank_delay_best_start));

        for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) {

            // do the setup on the active LMC
            // set the bytelanes DLL offsets
            change_dll_offset_enable(node, lmc, 0);
            load_dll_offset(node, lmc, dll_offset_mode, byte_offset, bytelane); // FIXME? bytelane?
            change_dll_offset_enable(node, lmc, 1);

            bdk_watchdog_poke();

            // run the test on each rank
            // only 1 call per rank should be enough, let the bursts, loops, etc, control the load...

            active_ranks = 0;

            for (rankx = 0; rankx < 4; rankx++) {
                if (!(rank_mask & (1 << rankx)))
                    continue;

                phys_addr = hw_rank_offset * active_ranks;
                // FIXME: now done by test_dram_byte_hw()
                //phys_addr |= (lmc << 7);
                //phys_addr = bdk_numa_get_address(node, phys_addr); // map to node

                active_ranks++;

                // NOTE: return is a now a bitmask of the erroring bytelanes..
                errors[rankx] = test_dram_byte_hw(node, lmc, phys_addr, mode, NULL);

                for (byte = byte_lo; byte <= byte_hi; byte++) { // do bytelane(s)

                    // check errors
                    if (errors[rankx] & (1 << byte)) { // yes, an error in the byte lane in this rank

                        ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: Address 0x%012llx errors 0x%x\n",
                                   node, lmc, rankx, bytelane, mode_str,
                                   byte_offset, phys_addr, errors[rankx]);

                        if (rank_delay_count[rankx][byte] > 0) { // had started run
                            ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: stopping a run here\n",
                                       node, lmc, rankx, bytelane, mode_str, byte_offset);
                            rank_delay_count[rankx][byte] = 0;   // stop now
                        }
                        // FIXME: else had not started run - nothing else to do?
                    } else { // no error in the byte lane
                        if (rank_delay_count[rankx][byte] == 0) { // first success, set run start
                            ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: starting a run here\n",
                                       node, lmc, rankx, bytelane, mode_str, byte_offset);
                            rank_delay_start[rankx][byte] = byte_offset;
                        }
                        rank_delay_count[rankx][byte] += BYTE_OFFSET_INCR; // bump run length

                        // is this now the biggest window?
                        if (rank_delay_count[rankx][byte] > rank_delay_best_count[rankx][byte]) {
                            rank_delay_best_count[rankx][byte] = rank_delay_count[rankx][byte];
                            rank_delay_best_start[rankx][byte] = rank_delay_start[rankx][byte];
                            debug_print("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: updating best to %d/%d\n",
                                        node, lmc, rankx, bytelane, mode_str, byte_offset,
                                        rank_delay_best_start[rankx][byte], rank_delay_best_count[rankx][byte]);
                        }
                    }
                } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
            } /* for (rankx = 0; rankx < 4; rankx++) */

        } /* for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) */

        // now choose the best byte_offsets for this pattern according to the best windows of the tested ranks
        // calculate offset by constructing an average window from the rank windows
        for (byte = byte_lo; byte <= byte_hi; byte++) {

            // intersect the per-rank windows: latest start, earliest end
            pat_beg = -999;
            pat_end = 999;

            for (rankx = 0; rankx < 4; rankx++) {
                if (!(rank_mask & (1 << rankx)))
                    continue;

                rank_beg = rank_delay_best_start[rankx][byte];
                pat_beg = max(pat_beg, rank_beg);
                rank_end = rank_beg + rank_delay_best_count[rankx][byte] - BYTE_OFFSET_INCR;
                pat_end = min(pat_end, rank_end);

                ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test:  Rank Window %3d:%3d\n",
                           node, lmc, rankx, bytelane, mode_str, rank_beg, rank_end);

            } /* for (rankx = 0; rankx < 4; rankx++) */

            // midpoint of the intersected window is this pattern's best offset
            pat_best_offset[byte] = (pat_end + pat_beg) / 2;
            ddr_print4("N%d.LMC%d: Bytelane %d DLL %s Offset Test:  Pattern %d Average %3d\n",
                       node, lmc, byte, mode_str, pattern, pat_best_offset[byte]);

#if 0
            // FIXME: next print the window counts
            sprintf(sbuffer, "N%d.LMC%d Pattern %d: DLL %s Offset Count ",
                    node, lmc, pattern, mode_str);
            printf("%-45s : ", sbuffer);
            printf(" %3d", byte_delay_best_count);
            printf("\n");
#endif

            new_best_offset[byte] += pat_best_offset[byte]; // sum the pattern averages
        } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
    } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */
    // end of pattern loop

    ddr_print("N%d.LMC%d: HW DLL %s Offset Amount   : ",
              node, lmc, mode_str);

    for (byte = byte_hi; byte >= byte_lo; --byte) { // print in decending byte index order
        new_best_offset[byte] = divide_nint(new_best_offset[byte], NUM_BYTE_PATTERNS); // create the new average NINT

        // print the best offsets from all patterns

        if (bytelane == 0x0A) // print just the offset of all the bytes
            ddr_print("%5d ", new_best_offset[byte]);
        else
            ddr_print("(byte %d) %5d ", byte, new_best_offset[byte]);


#if 1
        // done with testing, load up the best offsets we found...
        change_dll_offset_enable(node, lmc, 0); // disable offsets while we load...
        load_dll_offset(node, lmc, dll_offset_mode, new_best_offset[byte], byte);
        change_dll_offset_enable(node, lmc, 1); // re-enable the offsets now that we are done loading
#endif
    } /* for (byte = byte_hi; byte >= byte_lo; --byte) */

    ddr_print("\n");

#if 0
    // run the test one last time
    // print whether there are errors or not, but only when verbose...
    tot_errors = run_test_dram_byte_threads(node, num_lmcs, bytemask);
    printf("N%d.LMC%d: Bytelane %d DLL %s Offset Final Test: errors 0x%x\n",
           node, lmc, bytelane, mode_str, tot_errors);
#endif
}
1434 
1435 /*
1436  * Automatically adjust the DLL offset for the selected bytelane using hardware-assist
1437  */
perform_HW_dll_offset_tuning(bdk_node_t node,int dll_offset_mode,int bytelane)1438 int perform_HW_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int bytelane)
1439 {
1440     int save_ecc_ena[4];
1441     bdk_lmcx_config_t lmc_config;
1442     int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
1443     const char *s;
1444     //bdk_lmcx_comp_ctl2_t comp_ctl2;
1445     int loops = 1, loop;
1446 
1447     // see if we want to do the tuning more than once per LMC...
1448     if ((s = getenv("ddr_tune_ecc_loops"))) {
1449         loops = strtoul(s, NULL, 0);
1450     }
1451 
1452     // allow override of the test repeats (bursts)
1453     if ((s = getenv("ddr_tune_byte_bursts")) != NULL) {
1454         dram_tune_byte_bursts = strtoul(s, NULL, 10);
1455     }
1456 
1457     // print current working values
1458     ddr_print2("N%d: H/W Tuning for bytelane %d will use %d loops, %d bursts, and %d patterns.\n",
1459               node, bytelane, loops, dram_tune_byte_bursts,
1460               NUM_BYTE_PATTERNS);
1461 
1462     // FIXME? get flag from LMC0 only
1463     lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));
1464 
1465     // do once for each active LMC
1466 
1467     for (lmc = 0; lmc < num_lmcs; lmc++) {
1468 
1469         ddr_print4("N%d: H/W Tuning: starting LMC%d bytelane %d tune.\n", node, lmc, bytelane);
1470 
1471         /* Enable ECC for the HW tests */
1472         // NOTE: we do enable ECC, but the HW tests used will not generate "visible" errors
1473         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
1474         save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
1475         lmc_config.s.ecc_ena = 1;
1476         DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
1477         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
1478 
1479         // testing is done on a single LMC at a time
1480         // FIXME: for now, loop here to show what happens multiple times
1481         for (loop = 0; loop < loops; loop++) {
1482             /* Perform DLL offset tuning */
1483             //auto_set_dll_offset(node,  1 /* 1=write */, lmc, bytelane);
1484             hw_assist_test_dll_offset(node,  2 /* 2=read */, lmc, bytelane);
1485         }
1486 
1487         // perform cleanup on active LMC
1488         ddr_print4("N%d: H/W Tuning: finishing LMC%d bytelane %d tune.\n", node, lmc, bytelane);
1489 
1490         /* Restore ECC for DRAM tests */
1491         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
1492         lmc_config.s.ecc_ena = save_ecc_ena[lmc];
1493         DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
1494         lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
1495 
1496         // finally, see if there are any read offset overrides after tuning
1497         for (int by = 0; by < 9; by++) {
1498             if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
1499                 int dllro = strtoul(s, NULL, 10);
1500                 change_dll_offset_enable(node, lmc, 0);
1501                 load_dll_offset(node, lmc, 2 /* 2=read */, dllro, by);
1502                 change_dll_offset_enable(node, lmc, 1);
1503             }
1504         }
1505 
1506     } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
1507 
1508     // finish up...
1509 
1510     return 0;
1511 
1512 } /* perform_HW_dll_offset_tuning */
1513