在nvmain的源代码中,可以看到EnergyModel有两种,energy和current。

这可以在config文件中进行配置,比如说PCM_ISSCC_2012_4GB.config中这样配置:

1
EnergyModel energy

同样的,在STTRAM以及RRAM等NVM存储器中也都是energy,如上配置,而在DRAM等易失性存储器中这样配置:

1
EnergyModel current

源码中针对NVM存储器的EnergyModel,采用如下这样一种很简单的累加方式来计算能耗,其中Erd是单个mat的读能耗,在PCM_ISSCC_2012_4GB.config中设定值为0.081200,同时设定Ewr即写能耗(SET或者RESET)为1.684811:

1
2
3
4
5
else
{
/* Charge the configured per-read energy Erd to both the subarray total
 * and the active-energy counter. The matching `if` branch is not shown
 * in this excerpt. */
subArrayEnergy += p->Erd;
activeEnergy += p->Erd;
}

Write(NVMainRequest *request )

首先搞清楚一些基本知识。

1
2
3
4
5
6
7
8
/* States a subarray can be in. */
enum SubArrayState
{
    SUBARRAY_UNKNOWN     = 0, /* Unknown state. Uh oh. */
    SUBARRAY_OPEN        = 1, /* A row is currently open in this subarray. */
    SUBARRAY_CLOSED      = 2, /* The subarray is idle. */
    SUBARRAY_PRECHARGING = 3, /* Precharge in progress; returns to SUBARRAY_CLOSED. */
    SUBARRAY_REFRESHING  = 4  /* Refresh in progress; returns to SUBARRAY_CLOSED. */
};

SubArrayState有UNKNOWN、OPEN、CLOSED、PRECHARGING、REFRESHING五种状态。其中PRECHARGING和REFRESHING分别对应下面两种操作,操作结束后subarray都会回到SUBARRAY_CLOSED状态:

  1. Precharge:对于处于打开状态(这儿打开是指把page内容放入到Sense Amplifier)的page,我们可以进行读写操作,如果不需要再对该page进行读写操作,可以关闭该page, 把该page内容写入bank的行列单元对应的page中,然后DRAM core才能够准备下一个数据访问,以便对其它page进行读写操作。这个关闭操作通过发射一个Precharge命令实现,precharge命令可以关闭某一个bank,也可以关闭rank中所有打开的bank。

  2. Refreshing:DRAM(Dynamic Random Access Memory,即动态随机存取存储器)之所以称为DRAM,就是因为它要不断进行刷新(Refresh)才能保留住数据,因此它是DRAM最重要的操作。Refresh操作与Precharge中重写的操作一样,都是用S-AMP先读再写。但为什么有Precharge操作还要进行Refresh呢?因为Precharge是对一个或所有Bank中的工作行操作,并且是不定期的,而刷新则是有固定的周期,依次对所有行进行操作,以保留那些久久没经历重写的存储体中的数据。

1
2
3
4
5
6
/* Where a write lands when it is issued to a subarray. */
enum WriteMode
{
    WRITE_BACK    = 0, /* Only the row buffer is modified. */
    WRITE_THROUGH = 1, /* Both the row buffer and the cell are modified. */
    DELAYED_WRITE = 2  /* The data is held in a write buffer. */
};

WriteMode有三种,WRITE_BACK、WRITE_THROUGH以及DELAYED_WRITE:

  1. WRITE_BACK:只更新行缓冲区;

  2. WRITE_THROUGH:更新行缓冲区和cell;

  3. DELAYED_WRITE:数据被存储在写缓冲区;

Write函数分析

这个函数很重要,大致写一下自己的理解。

1
2
3
4
5
6
7
8
9
10
/* Store `value` through `dst` unless the caller passed a null pointer. */
static inline void AssignIfRequested( uint64_t *dst, uint64_t value )
{
    if( dst )
    {
        *dst = value;
    }
}

/*
 * Copy the translated address components held by this object into the
 * caller-supplied output pointers. Each pointer is optional: a null
 * pointer means the caller does not want that component, and it is
 * skipped.
 */
void NVMAddress::GetTranslatedAddress( uint64_t *addrRow, uint64_t *addrCol, uint64_t *addrBank, uint64_t *addrRank, uint64_t *addrChannel, uint64_t *addrSA )
{
    AssignIfRequested( addrRow, row );
    AssignIfRequested( addrCol, col );
    AssignIfRequested( addrBank, bank );
    AssignIfRequested( addrRank, rank );
    AssignIfRequested( addrChannel, channel );
    AssignIfRequested( addrSA, subarray );
}

首先进行sanity检查(合理性检查)。这一部分检查必不可少:即使上层已经调用过IsIssuable(),也不能完全信任它而省略这里的检查。

若nextWrite大于事件队列的当前时钟周期,则Subarray违反写时序限制;若SubArrayState不等于SUBARRAY_OPEN,则试图对非active状态的subarray进行写入而报错;若writeRow不等于openRow,则试图对没有open的行进行写入而报错。

若writeMode为WRITE_THROUGH,则需要更新行缓冲区和cell。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
 if( writeMode == WRITE_THROUGH )
{
/* Write-through modifies the cell as well as the row buffer, so the
 * encoder and the endurance model are consulted for this request. */
encLat = (dataEncoder ? dataEncoder->Write( request ) : 0);
endrLat = UpdateEndurance( request );

/* Count the number of bits modified. */
if( !p->WriteAllBits )
{
/* Scratch buffer holding one byte per byte of request data. */
uint8_t *bitCountData = new uint8_t[request->data.GetSize()];

/* XOR the new data with the old data byte by byte; a set bit in the
 * result marks a bit position whose value differs. */
for( uint64_t bitCountByte = 0; bitCountByte < request->data.GetSize(); bitCountByte++ )
{
bitCountData[bitCountByte] = request->data.GetByte( bitCountByte ) ^ request->oldData.GetByte( bitCountByte );
}

/* CountBitsMLC1 walks the buffer as 32-bit words, hence size / 4.
 * NOTE(review): bytes beyond the last full 4-byte word are not counted
 * if the size is not a multiple of 4 - confirm sizes are always
 * word-aligned. */
ncounter_t bitCountWords = request->data.GetSize()/4;

/* Number of 1 bits in the XOR result == number of bits that change. */
ncounter_t numChangedBits = CountBitsMLC1( 1, (uint32_t*)bitCountData, bitCountWords );

/* The scratch buffer is no longer needed; release it so that each
 * write-through request does not leak its allocation. */
delete[] bitCountData;

/* The changed-bit count can never exceed the total number of bits. */
assert( request->data.GetSize()*8 >= numChangedBits );

/* Unchanged bits = total bits - changed bits. */
numUnchangedBits = request->data.GetSize()*8 - numChangedBits;
}
}
1
2
3
4
5
6
7
8
9
10
11
/*
 * Return the number of 1 bits in the 32-bit word `data`.
 *
 * The count is built up in parallel inside the word: first per 2-bit
 * pair, then per 4-bit nibble, then per byte, and finally the four byte
 * counts are summed into the top byte by a multiply and shifted down.
 */
ncounter_t NO_OPT SubArray::Count32MLC1( uint32_t data )
{
    /* Each 2-bit field now holds the number of set bits in that pair. */
    const uint32_t pairCounts = data - ((data >> 1) & 0x55555555);

    /* Each 4-bit field now holds the number of set bits in that nibble. */
    const uint32_t nibbleCounts = (pairCounts & 0x33333333)
                                + ((pairCounts >> 2) & 0x33333333);

    /* Each byte now holds the number of set bits in that byte. */
    const uint32_t byteCounts = (nibbleCounts + (nibbleCounts >> 4)) & 0x0f0f0f0f;

    /* Multiplying by 0x01010101 adds all four byte counts into the top byte. */
    const uint32_t total = (byteCounts * 0x01010101) >> 24;

    return static_cast<ncounter_t>(total);
}
1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Count bits across `words` consecutive 32-bit words starting at `data`.
 *
 * When `value` is 1 the result is the number of 1 bits; for any other
 * `value` it is the number of remaining bits (words * 32 minus the
 * number of 1 bits).
 */
ncounter_t NO_OPT SubArray::CountBitsMLC1( uint8_t value, uint32_t *data, ncounter_t words )
{
    /* Sum the per-word 1-bit counts over the whole buffer. */
    ncounter_t onesTotal = 0;
    ncounter_t wordIdx = 0;
    while( wordIdx < words )
    {
        onesTotal += Count32MLC1( data[wordIdx] );
        wordIdx++;
    }

    if( value == 1 )
    {
        return onesTotal;
    }

    /* Caller asked for the complement: every bit that is not a 1. */
    return words*32 - onesTotal;
}

EnergyModel能耗优化计算方案

为基于NVM的主存储器设计一种延迟和能耗优化的写方案,对应的写能耗计算代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* Calculate energy. */
if( p->EnergyModel == "current" )
{
/* DRAM Model: write-burst current above the EIDD3N baseline, over one
 * burst, divided across the banks. The same amount is charged to both
 * counters. */
subArrayEnergy += ( ( p->EIDD4W - p->EIDD3N )
* (double)(p->tBURST) ) / (double)(p->BANKS);
burstEnergy += ( ( p->EIDD4W - p->EIDD3N ) *
(double)(p->tBURST) ) / (double)(p->BANKS);
}
else
{
/*
 * Per-level energy model for 3-bit cells.
 *
 * The new and old data are walked 32 bits at a time and each word is
 * cut into 3-bit groups. Every group whose new value differs from its
 * old value is counted under the NEW value, and the counts are then
 * weighted by the per-value write energies Ewr000..Ewr111.
 *
 * The flat model this replaces is kept for reference:
 */
//subArrayEnergy += p->Ewr - p->Ewrpb * numUnchangedBits;
uint32_t *rawData;
uint32_t *oldData;
/* Payload size in bits; 64 * 8 unless the request data is compressed. */
ncounter_t memoryWordSize = 64 * 8 ;
ncounter_t size = 0;
if(request->data.IsCompressed())
{
rawData = reinterpret_cast<uint32_t*>(request->data.comData);
memoryWordSize = request->data.GetComSize()*8;
}
else
{
rawData = reinterpret_cast<uint32_t*>(request->data.rawData);
}
/* NOTE(review): the old data is read over the same bit range as the new
 * data, whichever buffer is selected here - confirm it is always at
 * least that long. */
if(request->oldData.IsCompressed())
{
oldData = reinterpret_cast<uint32_t*>(request->oldData.comData);
}
else
{
oldData = reinterpret_cast<uint32_t*>(request->oldData.rawData);
}
/* Number of complete 32-bit words in the payload. */
size = memoryWordSize / 32;
double energy = 0;
unsigned int i = 0;
ncounter_t i_pos = 0;
uint32_t word;
uint32_t oldWord;
/* Selects the low 3 bits, i.e. one cell value. */
uint32_t mask = 0x00000007;
uint32_t cellValue;
uint32_t oldCellValue;
/* writeCount[v] = number of changed cells whose new value is v. */
ncounter_t writeCount[8] = { 0 };
/* Per-value write energy. Kept as double so that fractional energy
 * parameters are not truncated to integers before being accumulated
 * into the double `energy` below. */
double EwrTLC[8];
EwrTLC[0] = p->Ewr000;
EwrTLC[1] = p->Ewr001;
EwrTLC[2] = p->Ewr010;
EwrTLC[3] = p->Ewr011;
EwrTLC[4] = p->Ewr100;
EwrTLC[5] = p->Ewr101;
EwrTLC[6] = p->Ewr110;
EwrTLC[7] = p->Ewr111;
/* Full words: 11 groups of 3 bits each, lowest bits first.
 * NOTE(review): 11 * 3 = 33 > 32, so the last group only sees the top
 * 2 bits of the word, and cells are not carried across word boundaries
 * - confirm this approximation is intended. */
for(i = 0; i < size; i++)
{
word = rawData[i];
oldWord = oldData[i];
for(i_pos = 0; i_pos < 11; i_pos++)
{
cellValue = word & mask;
oldCellValue = oldWord & mask;
if(cellValue != oldCellValue)
writeCount[cellValue]++;
word = word >> 3;
oldWord = oldWord >> 3;
}
}
/* Leftover bits that do not fill a whole 32-bit word. `i` still indexes
 * the word right after the last full one. */
size = memoryWordSize % 32;
if(size != 0)
{
word = rawData[i];
oldWord = oldData[i];
/* Number of 3-bit groups needed to hold the leftover bits (rounded up). */
ncounter_t nums_r = size / 3;
if(size % 3 != 0)
nums_r++;
/* Skip the lowest (11 - nums_r) groups, then count the remaining
 * upper nums_r groups. */
for(i_pos = 0; i_pos < (11-nums_r); i_pos++)
{
word = word >> 3;
oldWord = oldWord >> 3;
}
for(; i_pos < 11; i_pos++)
{
cellValue = word & mask;
oldCellValue = oldWord & mask;
if(cellValue != oldCellValue)
writeCount[cellValue]++;
word = word >> 3;
oldWord = oldWord >> 3;
}
}
/* Weight each value's change count by that value's write energy. */
for(i_pos = 0; i_pos < 8; i_pos++)
{
energy += writeCount[i_pos] * EwrTLC[i_pos];
}
subArrayEnergy += energy;
/* The burst counter is still charged the flat per-write energy Ewr. */
burstEnergy += p->Ewr;
}