#include "ooo_cpu.h"

#define IPREF_LOOKAHEAD 11	// Prefetch lookahead

#define IPREF_START_DEPTH 1	// Start prefetch injection from this lookahead depth

// PTB configuration
#define LOG_NUM_IPREF_PTB_SETS 11
#define NUM_IPREF_PTB_SETS (1 << LOG_NUM_IPREF_PTB_SETS)
#define NUM_IPREF_PTB_WAYS 14

// Partial tag length of a PTB entry
#define IPREF_TAG_BITS 12
#define IPREF_TAG_MASK ((1 << IPREF_TAG_BITS) - 1)

// Lower target length of a PTB entry
#define IPREF_TARGET_BITS 14
#define IPREF_TARGET_MASK ((1 << IPREF_TARGET_BITS) - 1)

// Upper target pointer length of a PTB entry
#define IPREF_UPPER_TARGET_POINTER_BITS 5

// Number of dictionary entries
#define IPREF_UPPER_TARGET_DICTIONARY_SIZE (1 << IPREF_UPPER_TARGET_POINTER_BITS)

// PTB entry; size = 36 bits
class IPREF_PTBEntry {
   public:
   uint64_t tag;			// 12 bits
   uint32_t lower_target;		// 14 bits
   uint8_t  upper_target_pointer;	//  5 bits
   uint8_t  pht_counter;		//  2 bits
   uint8_t  rrpv;			//  2 bits
   bool     valid;			//  1 bit
};

IPREF_PTBEntry iprefPTB[NUM_IPREF_PTB_SETS][NUM_IPREF_PTB_WAYS];     // 2048x14x36 bits = 126 KB

// Dictionary entry; size = 50 bits
class IPREF_DictionaryEntry {
   public:
   uint64_t upper_target;	// 44 bits
   uint64_t lru;		//  5 bits
   bool     valid;		//  1 bit
};

IPREF_DictionaryEntry iprefDictionary[IPREF_UPPER_TARGET_DICTIONARY_SIZE];  // 32x50 bits = 1600 bits

// Allocate a new target in dictionary if not present already
static uint32_t iprefDictionaryAllocate (uint64_t upper_target)
{
   uint8_t i;

   for (i=0; i<IPREF_UPPER_TARGET_DICTIONARY_SIZE; i++) {
      if (iprefDictionary[i].valid && (iprefDictionary[i].upper_target == upper_target)) {
         // Hit
         break;
      }
   }

   if (i == IPREF_UPPER_TARGET_DICTIONARY_SIZE) {  // Miss: replace
      for (i=0; i<IPREF_UPPER_TARGET_DICTIONARY_SIZE; i++) {
         if (!iprefDictionary[i].valid) break;
      }
      if (i == IPREF_UPPER_TARGET_DICTIONARY_SIZE) {
         uint64_t maxlru = 0;
         int index = -1;
         for (i=0; i<IPREF_UPPER_TARGET_DICTIONARY_SIZE; i++) {
            if (iprefDictionary[i].lru > maxlru) {
               maxlru = iprefDictionary[i].lru;
               index = i;
            }
         }
         i = index;
      }
      iprefDictionary[i].upper_target = upper_target;
      iprefDictionary[i].valid = true;
   }

   for (uint8_t j=0; j<IPREF_UPPER_TARGET_DICTIONARY_SIZE; j++) iprefDictionary[j].lru++;
   iprefDictionary[i].lru = 0;

   return i;
}

// Update LRU states in dictionary on a read
static void iprefDictionaryUpdateLRUState (uint8_t read_index)
{
   for (uint8_t j=0; j<IPREF_UPPER_TARGET_DICTIONARY_SIZE; j++) iprefDictionary[j].lru++;
   iprefDictionary[read_index].lru = 0;
}

// Look up PTB to retrieve prefetch target and pattern history counter
static bool iprefPTBLookup (uint64_t signature, uint64_t ip, uint64_t *target, uint8_t *counter)
{
   uint32_t iprefPTBIndex = (signature & (NUM_IPREF_PTB_SETS - 1));
   uint64_t iprefPTBTag = ip & IPREF_TAG_MASK;
   uint32_t i;

   for (i=0; i<NUM_IPREF_PTB_WAYS; i++) {
      if (iprefPTB[iprefPTBIndex][i].valid && (iprefPTB[iprefPTBIndex][i].tag == iprefPTBTag)) {
         // Hit
         (*target) = (iprefDictionary[iprefPTB[iprefPTBIndex][i].upper_target_pointer].upper_target<<IPREF_TARGET_BITS) | iprefPTB[iprefPTBIndex][i].lower_target;
         iprefDictionaryUpdateLRUState (iprefPTB[iprefPTBIndex][i].upper_target_pointer);
         (*counter) = iprefPTB[iprefPTBIndex][i].pht_counter;
         iprefPTB[iprefPTBIndex][i].rrpv = 0;
         return true;
      }
   }

   // Miss
   return false;
}

// Allocate a new entry in PTB if not already present and taken
static void iprefPTBAllocate (uint64_t signature, uint64_t ip, uint64_t target, uint8_t taken)
{
   uint32_t iprefPTBIndex = (signature & (NUM_IPREF_PTB_SETS - 1));
   uint64_t iprefPTBTag = ip & IPREF_TAG_MASK;
   uint32_t i;

   for (i=0; i<NUM_IPREF_PTB_WAYS; i++) {
      if (iprefPTB[iprefPTBIndex][i].valid && (iprefPTB[iprefPTBIndex][i].tag == iprefPTBTag)) {
         // Hit
         if (taken) {
            iprefPTB[iprefPTBIndex][i].lower_target = target & IPREF_TARGET_MASK;
            iprefPTB[iprefPTBIndex][i].upper_target_pointer = iprefDictionaryAllocate(target >> IPREF_TARGET_BITS);
            if (iprefPTB[iprefPTBIndex][i].pht_counter < 3) iprefPTB[iprefPTBIndex][i].pht_counter++;
         }
         else {
            if (iprefPTB[iprefPTBIndex][i].pht_counter > 0) iprefPTB[iprefPTBIndex][i].pht_counter--;
         }
         iprefPTB[iprefPTBIndex][i].rrpv = 0;
         break;
      }
   }

   if (taken) {
      // Miss: replace and allocate
      if (i == NUM_IPREF_PTB_WAYS) {
         for (i=0; i<NUM_IPREF_PTB_WAYS; i++) {
            if (!iprefPTB[iprefPTBIndex][i].valid) break;
         }
         if (i == NUM_IPREF_PTB_WAYS) {
            for (i=0; i<NUM_IPREF_PTB_WAYS; i++) {
               if (!iprefPTB[iprefPTBIndex][i].pht_counter) break;
            }
            if (i == NUM_IPREF_PTB_WAYS) {
               while (1) {
                  for (i=0; i<NUM_IPREF_PTB_WAYS; i++) {
                     if (iprefPTB[iprefPTBIndex][i].rrpv == 3) break;
                  }
                  if (i == NUM_IPREF_PTB_WAYS) {
                     for (i=0; i<NUM_IPREF_PTB_WAYS; i++) iprefPTB[iprefPTBIndex][i].rrpv++;
                  }
                  else break;
               }
            }
         }
         assert(i < NUM_IPREF_PTB_WAYS);
         iprefPTB[iprefPTBIndex][i].tag = iprefPTBTag;
         iprefPTB[iprefPTBIndex][i].lower_target = target & IPREF_TARGET_MASK;
         iprefPTB[iprefPTBIndex][i].upper_target_pointer = iprefDictionaryAllocate(target >> IPREF_TARGET_BITS);
         iprefPTB[iprefPTBIndex][i].pht_counter = 2;	// weakly taken
         iprefPTB[iprefPTBIndex][i].valid = true;
         iprefPTB[iprefPTBIndex][i].rrpv = 2;
      }
   }
}

// Number of entries in the recent access filter
#define IPREF_FILTER_SIZE 13

// Recent access filter entry; size = 59 bits
class IPrefFilterEntry {
   public:
      bool     valid;	//  1 bit
      uint64_t tag;	// 58 bits
};

IPrefFilterEntry iprefFilter[IPREF_FILTER_SIZE];  // total size = 13x59 bits = 767 bits
uint8_t iprefFilterTailPtr;  // total size = 4 bits

// Search for a block address in the recent access filter
static bool iprefFilterLookup (uint64_t block_addr)
{
   for (int i=0; i<IPREF_FILTER_SIZE; i++) {
      if (iprefFilter[i].valid && (iprefFilter[i].tag == block_addr)) return true;
   }
   return false;
}

// Enqueue a new block address at the tail
static void iprefFilterInsert (uint64_t block_addr)
{
   iprefFilter[iprefFilterTailPtr].tag = block_addr;
   iprefFilter[iprefFilterTailPtr].valid = true;
   iprefFilterTailPtr = (iprefFilterTailPtr + 1) % (IPREF_FILTER_SIZE);
}

uint64_t ipref_last_ip_block;      	// total size = 58 bits
bool ipref_last_ip_block_valid;		// total size = 1 bit
uint32_t ipref_global_hist;		// Block-grain global history (11 bits)

void O3_CPU::l1i_prefetcher_initialize() 
{
	ipref_global_hist = 0;

        ipref_last_ip_block_valid = false;

	for (int i=0; i<NUM_IPREF_PTB_SETS; i++) for (int j=0; j<NUM_IPREF_PTB_WAYS; j++) iprefPTB[i][j].valid = false;
        for (int i=0; i<IPREF_FILTER_SIZE; i++) iprefFilter[i].valid = false;
	iprefFilterTailPtr = 0;
}

void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target)
{

}

void O3_CPU::l1i_prefetcher_cache_operate(uint64_t addr, uint8_t cache_hit, uint8_t prefetch_hit)
{
	uint8_t taken, counter;    			// total size = 1+2 = 3 bits
        bool ptbhit;  					// total size = 1 bit
	uint32_t spec_ghist;				// Speculative copy of block-grain global history (11 bits)
	uint64_t iprefTarget;				// Predicted prefetch target block (58 bits)
 	uint64_t ipref_ip_block, ipref_ip_block_copy;	// total size = 2x58 bits = 116 bits
	uint8_t ipref_pred;				// total size = 1 bit

	ipref_ip_block = addr >> LOG2_BLOCK_SIZE;       // current block address

	// Insert demand accesses into the recent access filter
	if (!iprefFilterLookup (ipref_ip_block)) iprefFilterInsert (ipref_ip_block);

	if (ipref_last_ip_block_valid) {
        	if (ipref_ip_block != ipref_last_ip_block) {
			if (ipref_ip_block == (ipref_last_ip_block + 1)) taken = 0;
			else taken = 1;

                        iprefPTBAllocate (ipref_global_hist^ipref_last_ip_block, ipref_last_ip_block, ipref_ip_block, taken);
                        ipref_global_hist = (ipref_global_hist << 1) | (taken & 0x1);
                        spec_ghist = ipref_global_hist;
                        ipref_ip_block_copy = ipref_ip_block;

			// Look ahead prefetch following the current block
                        for (uint8_t i=0; i<IPREF_LOOKAHEAD; i++) {
                           ipref_pred = 1;
                           ptbhit = iprefPTBLookup (spec_ghist ^ ipref_ip_block_copy, ipref_ip_block_copy, &iprefTarget, &counter);
                           if (!ptbhit || (counter < 2)) {
			      // Predicted not taken
                              ipref_pred = 0;
                              iprefTarget = ipref_ip_block_copy + 1;
                           }
                           spec_ghist = (spec_ghist << 1) | (ipref_pred & 0x1);
                           ipref_ip_block_copy = iprefTarget;
                           if (!iprefFilterLookup (iprefTarget) && ((i+1) >= IPREF_START_DEPTH)) {
                              prefetch_code_line(iprefTarget << LOG2_BLOCK_SIZE);
                              iprefFilterInsert (iprefTarget);
                           }
			}
			ipref_last_ip_block = ipref_ip_block;
        	}
	}
	else {
		ipref_last_ip_block = ipref_ip_block;
		ipref_last_ip_block_valid = true;
	}
}

void O3_CPU::l1i_prefetcher_cycle_operate()
{

}

void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr)
{

}

void O3_CPU::l1i_prefetcher_final_stats()
{

}
