///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// $Id: FrequencyDBImpl_hash.cc,v 1.13 2003/08/31 00:11:57 bburton Exp $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//

#ifdef USE_MMAP

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include "hash.h"
#include "LockFD.h"
#include "WordData.h"
#include "FrequencyDBImpl_hash.h"

// Layout constants for the array of entries stored in the hash file.
enum {
  // Multiplier setSize() applies to explicit sizes below this value; also
  // the number of entries open() zero-fills per write() when creating a file.
  SIZE_MULTIPLE = 1024,
  // Index of the first slot used for hashed words; the slots below it are
  // never produced by the hashing code in getIndexForWord().
  INDEX_OFFSET = 5,
  // Slot that holds the counts stored under FrequencyDB::COUNT_WORD.
  COUNT_INDEX = 0,
};

// Limits for the size argument of setSize() while it is interpreted as
// "twice the number of hash bits" (an odd value adds half a bit).
enum {
  // Smallest accepted value; setSize() raises anything lower to this.
  // 34 >> 1 == 17 mask bits.
  BIT_MASK_MIN_BITS = 34,   // min size using bits (17 bits)
  // Largest value still treated as a bit count: 63 >> 1 == 31 mask bits
  // plus the extra half bit.  Larger values are treated as entry counts.
  BIT_MASK_MAX_BITS = 63,   // max size using bits (31.5 bits)
  // Set to non-zero to let getIndexForWord() trace each hashing step
  // (only when is_debug is also set).
  DEBUG_HASH = 0,
};

// Creates a default-sized hash database implementation on the heap.
// The caller receives the raw pointer returned by new.
FrequencyDBImpl *FrequencyDBImpl_hash::factory()
{
  FrequencyDBImpl *impl = new FrequencyDBImpl_hash();
  return impl;
}

// Builds an unopened database object.
//
// _size is handed to setSize(), which decides how it is interpreted and
// assigns m_size, m_maskBits and m_addHalfBits.  No file is touched until
// open() is called.
FrequencyDBImpl_hash::FrequencyDBImpl_hash(int _size)
  : m_cursor(0),
    // provisional value only; setSize() below always overwrites m_size
    m_size(_size * SIZE_MULTIPLE),
    m_base(0)
{
  setSize(_size);
}

// Configures the table size and the hashing mode from a single number.
//
// Values up to BIT_MASK_MAX_BITS (after being raised to at least
// BIT_MASK_MIN_BITS) mean "twice the number of mask bits": the table gets
// hashsize(bits) entries, an odd value adds hashsize(bits - 1) more, and
// INDEX_OFFSET reserved slots are added on top.
//
// Larger values select modulo hashing (m_maskBits == 0) and are taken as an
// entry count, multiplied by SIZE_MULTIPLE when below SIZE_MULTIPLE.
void FrequencyDBImpl_hash::setSize(int _size)
{
  // sanity test: never go below the minimum bit-based size
  const int requested = (_size < BIT_MASK_MIN_BITS) ? (int)BIT_MASK_MIN_BITS : _size;

  if (requested > BIT_MASK_MAX_BITS) {
    // explicit entry count, hashed by modulo
    m_maskBits = 0;
    m_addHalfBits = false;
    m_size = (requested < SIZE_MULTIPLE) ? requested * SIZE_MULTIPLE : requested;
    return;
  }

  // bit-mask hashing: low bit of the request asks for the extra half bit
  m_maskBits = requested >> 1;
  m_addHalfBits = (requested & 1) != 0;

  m_size = hashsize(m_maskBits);
  if (m_addHalfBits) {
    m_size += hashsize(m_maskBits - 1);
  }
  m_size += INDEX_OFFSET;
}

// Releases the mapping and the file descriptor if the database is still open.
FrequencyDBImpl_hash::~FrequencyDBImpl_hash()
{
  this->close();
}

// Opens the hash file, creating and zero-filling it when it does not exist,
// locks it and maps it into memory.
//
// arg_filename  path of the hash file, optionally prefixed with "SIZE:"
//               where SIZE is passed to setSize().
// read_only     open/map the file read-only and take a shared lock;
//               otherwise read-write with an exclusive lock.
// create_mode   permission bits passed to ::open() when the file is created.
//
// Returns true when the file is mapped and ready for use, false after
// printing an error to cerr.  Any previously opened database is closed first.
bool FrequencyDBImpl_hash::open(const string &arg_filename,
                                bool read_only,
                                int create_mode)
{
  close();

  if (is_debug) {
    cerr << "OPEN DATABASE " << arg_filename << endl;
  }

  // Split an optional "SIZE:" prefix off the path.
  string filename(arg_filename);
  string::size_type colon_pos = filename.find(':');
  if (colon_pos != string::npos) {
    // NOTE(review): when no number can be parsed, size keeps the current
    // m_size and setSize() is re-applied to that entry count; the text up to
    // the colon is stripped either way.
    int size = m_size;
    sscanf(filename.c_str(), "%d:", &size);
    if (size > 0) {
      setSize(size);
    }
    filename.erase(0, colon_pos + 1);
  }

  int flags = (read_only) ? O_RDONLY : O_RDWR;
  File data_file(filename);
  bool exists = data_file.isFile();
  if (!exists) {
    flags |= O_CREAT;
  }

  if (exists) {
    // An existing file dictates the number of entries.
    // NOTE(review): only m_size is updated here; m_maskBits keeps the
    // configured value, so this relies on the file having been created with
    // the same size setting - confirm with callers.
    unsigned long file_size = data_file.getSize();
    if ((file_size % WordArray::ENTRY_SIZE) != 0) {
      cerr << "error: hash file "
           << filename
           << " size not a multiple of "
           << WordArray::ENTRY_SIZE
           << " bytes"
           << endl;
      return false;
    }
    m_size = file_size / WordArray::ENTRY_SIZE;
  }

  if (is_debug) {
    cerr << "HASH DATABASE " << filename << " SIZE " << m_size << endl;
  }

  int fd = ::open(filename.c_str(), flags, create_mode);
  if (fd < 0) {
    cerr << "error: unable to open database " << filename << ": " << strerror(errno) << endl;
    return false;
  }

  if (is_debug) {
    cerr << "LOCKING DATABASE " << filename << endl;
  }

  // From here on the descriptor is reachable through m_lock, so close()
  // takes care of it on every error path.
  m_lock.set(new LockFD(fd));
  m_lock->lock(read_only ? LockFD::SHARED_LOCK : LockFD::EXCLUSIVE_LOCK);

  if (is_debug) {
    cerr << "LOCKED DATABASE " << filename << endl;
  }

  if (!exists) {
    if (is_debug) {
      cerr << "CREATING HASH FILE " << filename << endl;
    }

    char zeros[SIZE_MULTIPLE * WordArray::ENTRY_SIZE];
    memset(zeros, 0, sizeof(zeros));

    // Fill the new file with zeroed entries, SIZE_MULTIPLE entries at a
    // time.  Every write is checked: a file shorter than the mapping would
    // fault as soon as the missing part is touched.
    for (int i = 0; i < m_size; i += SIZE_MULTIPLE) {
      size_t remaining = min(m_size - i, (int)SIZE_MULTIPLE) * WordArray::ENTRY_SIZE;
      const char *next = zeros;
      while (remaining > 0) {
        ssize_t written = ::write(fd, next, remaining);
        if (written < 0 && errno == EINTR) {
          // interrupted before anything was written; just retry
          continue;
        }
        if (written <= 0) {
          cerr << "error: unable to initialize hash file " << filename << ": "
               << (written < 0 ? strerror(errno) : "no data written") << endl;
          close();
          return false;
        }
        // a short write is legal; continue with the rest of the chunk
        next += written;
        remaining -= written;
      }
    }
  }

  if (is_debug) {
    cerr << "MMAPPING HASH FILE " << filename << endl;
  }
  int prot = (read_only) ? PROT_READ : (PROT_READ | PROT_WRITE);
  void *mapping = mmap(0, m_size * WordArray::ENTRY_SIZE, prot, MAP_SHARED, fd, 0);
  if (mapping == MAP_FAILED) {
    cerr << "error: unable to mmap file " << filename << ": " << strerror(errno) << endl;
    m_base = 0;
    close();
    return false;
  }
  m_base = (char *)mapping;

  m_array.reset(m_base, m_size);
  return true;
}

// Unmaps the file (if mapped), closes its descriptor (if open) and resets
// the iteration cursor.  Safe to call when nothing is open.
void FrequencyDBImpl_hash::close()
{
  m_cursor = 0;

  const bool is_mapped = (m_base != 0);
  if (is_mapped) {
    ::munmap(m_base, m_size * WordArray::ENTRY_SIZE);
    // forget the stale mapping in both the raw pointer and the array view
    m_base = 0;
    m_array.reset(0, 0);
  }

  if (m_lock.get()) {
    // the descriptor is only reachable through the lock object
    const int fd = m_lock->getFD();
    ::close(fd);
    m_lock.clear();
  }
}

// Intentionally does nothing: this implementation performs no explicit
// flushing of the mapped file.
void FrequencyDBImpl_hash::flush()
{
}

inline int hash_string(const string &str)
{
  return (int)(0x7fffffff & jenkins_hash((const ub1 *)str.c_str(), str.length(), 0));
}

// Stores counts in the slot that word maps to, replacing whatever the slot
// held before.  The database must be open.
void FrequencyDBImpl_hash::writeWord(const string &word,
                                     const WordData &counts)
{
  assert(m_lock.get());
  assert(m_base);

  m_array.writeWord(getIndexForWord(word), counts);
}

// Loads the slot that word maps to into counts.
// Returns true when the loaded total count is non-zero, i.e. the slot is
// in use.  The database must be open.
bool FrequencyDBImpl_hash::readWord(const string &word,
                                    WordData &counts)
{
  assert(m_lock.get());
  assert(m_base);

  m_array.readWord(getIndexForWord(word), counts);

  const bool slot_in_use = (counts.totalCount() != 0);
  return slot_in_use;
}

// Returns the name under which a slot is reported during iteration.
//
// COUNT_INDEX maps to FrequencyDB::COUNT_WORD.  Every other slot is named
// "I0x" followed by the index as at least eight lower-case hex digits, the
// form getIndexForWord() parses back into an index.
string FrequencyDBImpl_hash::getWordForIndex(int index)
{
  if (index == COUNT_INDEX) {
    return FrequencyDB::COUNT_WORD;
  }

  // index is an int, so it must be formatted with an int-sized conversion;
  // "%lx" would read a long from the argument list.
  char buffer[128];
  snprintf(buffer, sizeof(buffer), "I0x%08x", (unsigned int)index);
  return buffer;
}

int FrequencyDBImpl_hash::getIndexForWord(const string &word)
{
  if (word == FrequencyDB::COUNT_WORD) {
    return COUNT_INDEX;
  }

  int index = 0;
  if (starts_with(word, "I0x")) {
    sscanf(word.c_str() + 3, "%x", &index);
  } else {
    int hash = hash_string(word);
    if (m_maskBits > 0) {
      if (DEBUG_HASH && is_debug) {
        cerr << "ORIGINAL HASH 0x" << hex << hash << endl;
        cerr << "HASH MASK     0x" << hex << (hashmask(m_maskBits)) << endl;
      }

      bool add_extra = (hash >> m_maskBits + 1) & 1;
      if (DEBUG_HASH && is_debug) {
        cerr << "add_extra = " << add_extra << endl;
      }

      // add in extra bits we'll be masking off so they can have some impact
      hash = hash ^ (hash >> m_maskBits);
      if (DEBUG_HASH && is_debug) {
        cerr << "ADJUSTED HASH 0x" << hex << hash << endl;
      }

      hash = hash & hashmask(m_maskBits);
      if (DEBUG_HASH && is_debug) {
        cerr << "MASKED   HASH 0x" << hex << hash << endl;
      }

      if (m_addHalfBits && add_extra) {
        index = hash + (hash >> 1);
      } else {
        index = hash;
      }

      if (DEBUG_HASH && is_debug) {
        cerr << "FINAL    HASH 0x" << hex << index << endl;
      }

      index += INDEX_OFFSET;
    } else {
      index = INDEX_OFFSET + (hash % (m_size - INDEX_OFFSET));
    }
  }
  return index;
}

// Starts an iteration over the database.
//
// The first item reported is always FrequencyDB::COUNT_WORD with the counts
// stored at COUNT_INDEX; nextWord() then continues from INDEX_OFFSET.
// Always returns true.  The database must be open.
bool FrequencyDBImpl_hash::firstWord(string &word,
                                     WordData &counts)
{
  assert(m_lock.get());
  assert(m_base);

  // position the cursor for the following nextWord() calls
  m_cursor = INDEX_OFFSET;

  m_array.readWord(COUNT_INDEX, counts);
  word = FrequencyDB::COUNT_WORD;
  return true;
}

// Advances the iteration to the next slot whose total count is positive.
//
// On success word receives the slot's generated name, counts its data, the
// cursor is left just past that slot and true is returned.  Returns false
// once the end of the table has been reached.  The database must be open.
bool FrequencyDBImpl_hash::nextWord(string &word,
                                    WordData &counts)
{
  assert(m_lock.get());
  assert(m_base);

  while (m_cursor < m_size) {
    m_array.readWord(m_cursor, counts);

    const bool occupied = counts.totalCount() > 0;
    if (occupied) {
      word = getWordForIndex(m_cursor);
    }

    // step past the slot whether or not it was reported
    ++m_cursor;
    if (occupied) {
      return true;
    }
  }

  return false;
}

// Human readable name of this storage backend.
string FrequencyDBImpl_hash::getDatabaseType() const
{
  return string("Hashed-array");
}

// Clears every hashed slot that is in use, has a total count of at most
// junk_count and an age greater than max_age.  The reserved slots below
// INDEX_OFFSET are never touched.  The database must be open.
void FrequencyDBImpl_hash::sweepOutOldTerms(int junk_count,
                                            int max_age)
{
  assert(m_lock.get());
  assert(m_base);

  WordData counts;
  for (int slot = INDEX_OFFSET; slot < m_size; ++slot) {
    m_array.readWord(slot, counts);

    const bool expendable = counts.totalCount() > 0
                            && counts.totalCount() <= junk_count
                            && counts.age() > max_age;
    if (!expendable) {
      continue;
    }

    if (is_debug) {
      cerr << "sweepOutJunk: removing term " << getWordForIndex(slot)
           << " with total count " << counts.totalCount()
           << " and age " << counts.age()
           << endl;
    }

    // write an emptied record back over the slot
    counts.clear();
    m_array.writeWord(slot, counts);
  }
}

// Reports whether the given term may be cached; this implementation answers
// false for every word.
bool FrequencyDBImpl_hash::canCacheTerm(const string &word)
{
  const bool cacheable = false;
  return cacheable;
}

#endif // USE_MMAP
