
#
# Ported from LuceneBenchmark.java, which was written by Andrzej Bialecki
#
#  Preliminary Release Notes
#  -------------------------
#
#  This port doesn't seem to run to completion when
#  setUseCompoundFile(False) is called on the IndexWriter. Without compound
#  files enabled, it fails with a 'too many open files' error. To reproduce
#  the error, look for 'for compound in LuceneBenchmark.bools[n:]:' and
#  change the previous line to say 'n = 0'.
# 
#  Some searches return no hits; the logic had to be changed to accommodate
#  that and avoid divide-by-zero errors. Maybe IndexWriter.maxFieldLength
#  needs to be set? Maybe there is a bug? Maybe the set of documents was
#  too small during testing?
#
#  The resulting performance numbers were not compared with the Java
#  version or validated for accuracy.
#
#  To lengthen the benchmark, switch to a larger document source by
#  changing the sources attribute of the LuceneBenchmark class below.
# 
#  You may also increase RUN_COUNT or SCALE_UP, or increase the jumbo files.
#  Look for 'for k in xrange(11, 21):' and increase the second number.
#

import os, sys, tarfile, time

from cStringIO import StringIO
from urlparse import urlparse
from urllib import urlretrieve

from Streams import InputStreamReader, StringReader

from PyLucene import Analyzer, StandardAnalyzer, DateField, Document, Field
from PyLucene import IndexReader, IndexWriter, IndexSearcher
from PyLucene import QueryParser, Hits, Query, Directory, FSDirectory
from PyLucene import System, Runtime, DateFormat, Date


class LuceneBenchmark(object):

    # Benchmark driver. It downloads and unpacks a reference corpus, builds
    # a second corpus of "jumbo" files by concatenating source files, then
    # indexes and searches each corpus once per TestData parameter
    # combination while recording timing and memory data.

    bools = [ False, True ]
    queries = [ "file:src", "body:article", "body:article subject:re",
                "file:comp body:article* subject:re" ]

    # Number of iterations for each operation.
    RUN_COUNT = 1

    # Artificially scale up the number of source documents by this factor.
    SCALE_UP = 1

    # A progress line is printed, and a data point recorded, every LOG_STEP
    # documents added to the index.
    LOG_STEP = 250

    # Name of the marker file created in the work directory once setup()
    # has completed.
    SETUP_DONE = ".setup_done"

    # Default constructor - initialize the list of URLs with source corpora.

    def __init__(self):

        self.workDir = None
        self.indexDir = None
        self.srcUrl = None
        self.sourceDir = None
        self.jumboDir = None
        self.setup_done = None

        # Larger alternative corpora, for a longer benchmark:
        #   http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz
        #   http://people.csail.mit.edu/u/j/jrennie/public_html/20Newsgroups/20news-18828.tar.gz
        self.sources = [ "http://kdd.ics.uci.edu/databases/20newsgroups/mini_newsgroups.tar.gz" ]

    # Set the working directory. This must be called before setup().
    #  param wd path of the working directory
    #  raises RuntimeError if setup() was already run

    def setWorkDir(self, wd):

        if self.setup_done is not None:
            raise RuntimeError("Too late - setup already done.")

        self.workDir = wd

    # Download a source file from URL, trying each entry of self.sources in
    # turn. The file is stored in the current directory under the last path
    # component of its URL.
    #  return local file name, or None if every download failed

    def getSourceFile(self):

        # The hook cannot rebind a local of the enclosing function, so the
        # percentage already reported is kept on self.
        def hook(blockCount, blockSize, totalSize):
            if blockCount == 0:
                sys.stdout.write('    reading %s bytes' %(totalSize,))
                sys.stdout.flush()
            elif totalSize > 0:
                done = (float(blockCount * blockSize) / totalSize) * 100
                if done > self.pct + 10:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    self.pct += 10

        for source in self.sources:
            # restart the progress display for every attempt
            self.pct = 0
            filename = os.path.basename(urlparse(source)[2])
            print('    retrieving %s to %s' %(source, filename))
            try:
                filename, headers = urlretrieve(source, filename, hook)
            except Exception:
                print(sys.exc_info()[1])
                print('    failed')
                # setup() treats an existing file as a complete archive, so
                # a partial download must not be left behind.
                try:
                    os.remove(filename)
                except OSError:
                    pass
                continue

            sys.stdout.write('\n')
            return filename

        return None

    # Delete files and directories, even if non-empty.
    #  param dir file or directory
    #  return True on success, False if dir is None or does not exist
    #  raises OSError if something could not be deleted

    def fullyDelete(self, dir):

        import shutil

        if dir is None:
            return False

        # A symbolic link is removed itself; its target is left alone.
        isLink = os.path.islink(dir)
        if not isLink and not os.path.exists(dir):
            return False

        if isLink or not os.path.isdir(dir):
            os.remove(dir)
        else:
            shutil.rmtree(dir)

        return True

    # Make sure the sources are downloaded and unpacked, remove old indexes.
    # Prepare a set of large documents.
    #  raises RuntimeError if no source archive could be obtained

    def setup(self):

        import tempfile

        if self.workDir is None:
            self.workDir = tempfile.mkdtemp(prefix=".lucene_benchmark")
        else:
            # Make the path absolute before changing into it below,
            # otherwise every path joined onto a relative workDir would be
            # resolved inside workDir a second time.
            self.workDir = os.path.abspath(self.workDir)
            if not os.path.isdir(self.workDir):
                os.makedirs(self.workDir)

        print(" - setup in %s" %(self.workDir,))
        os.chdir(self.workDir)

        self.setup_done = os.path.join(self.workDir, self.SETUP_DONE)
        self.indexDir = os.path.join(self.workDir, "index")
        self.sourceDir = os.path.join(self.workDir, "src")
        self.jumboDir = os.path.join(self.workDir, "jumbo", "jumbo")

        self.reset()

        # reuse old setup
        if os.path.exists(self.setup_done):
            return

        src = None
        # check if one of the sources is downloaded
        for source in self.sources:
            filename = os.path.basename(urlparse(source)[2])
            if os.path.exists(filename):
                src = filename
                break

        if src is None:
            src = self.getSourceFile()
        if src is None:
            raise RuntimeError("Unable to download any of the source corpora.")

        self.fullyDelete(self.sourceDir)
        os.makedirs(self.sourceDir)

        sys.stdout.write("* Unpacking reference collection: %s" %(src,))
        dircnt = 0
        fcnt = 0
        skipped = 0
        root = os.path.abspath(self.sourceDir)
        tar = tarfile.open(src, "r:gz")
        try:
            for member in tar:
                if fcnt % 100 == 0:
                    sys.stdout.write('+')
                    sys.stdout.flush()
                # The archive was downloaded, so do not let a member name
                # place anything outside of the source directory.
                target = os.path.abspath(os.path.join(root, member.name))
                if target != root and not target.startswith(root + os.sep):
                    skipped += 1
                    continue
                tar.extract(member, self.sourceDir)
                if member.isfile():
                    fcnt += 1
                elif member.isdir():
                    dircnt += 1
        finally:
            tar.close()

        sys.stdout.write('\n')
        if skipped:
            print(" - skipped %d archive members outside %s" %(skipped, root))
        print(" - %d source files in %d directories." %(fcnt, dircnt))
        print("* Creating jumbo files...")

        fcnt = 0
        dircnt = 0

        # concatenate
        self.fullyDelete(self.jumboDir)
        os.makedirs(self.jumboDir)

        groupDir = os.path.join(self.sourceDir, os.listdir(self.sourceDir)[0])
        for group in os.listdir(groupDir):
            outdir = os.path.join(self.jumboDir, group)
            os.makedirs(outdir)
            dircnt += 1
            print(" - creating jumbo files in %s" %(outdir,))

            fileDir = os.path.join(groupDir, group)
            files = os.listdir(fileDir)
            # sorted so that every setup concatenates the same files
            files.sort()
            # Jumbo file "k" is the concatenation of the first k files of
            # the group, or of all of them if the group has fewer.
            for k in xrange(11, 21):
                output = open(os.path.join(outdir, str(k)), "w")
                try:
                    for name in files[:k]:
                        inFile = open(os.path.join(fileDir, name))
                        try:
                            output.write(inFile.read())
                        finally:
                            inFile.close()
                finally:
                    output.close()
                fcnt += 1
        print(" - %d jumbo files in %d directories." %(fcnt, dircnt))

        # create the "done" file
        open(self.setup_done, "w").close()

    # Remove existing index and create an empty index directory.

    def reset(self):

        self.fullyDelete(self.indexDir)
        os.makedirs(self.indexDir)

    # Remove index and unpacked source files. You have to run setup() after
    # you run this method.

    def clean(self):

        self.reset()
        self.fullyDelete(self.jumboDir)
        self.fullyDelete(self.sourceDir)
        if os.path.exists(self.setup_done):
            os.remove(self.setup_done)

    # Assume the input is an NNTP message, where the header is separated from
    # the body with blank line.
    # Extract basic metadata from the header: "From:", "Subject:", "Date:". If
    # more fields are needed, the first couple of lines from the body will be
    # converted into additional fields.
    # NOTE: this method doesn't even pretend to be an RFC-compliant parser, so
    # don't expect any MIME or transport decoding or similar.
    #
    #  param filename input file
    #  param addFields if greater than 0, add more fields named "line0",
    #        "line1", "line2", etc, with the content made from the body text.
    #  param tags list of strings, each added as a field named tag + index
    #  param stored, tokenized, tfv passed through to every Field created
    #  return Lucene document

    def makeDocument(self, filename, addFields, tags, stored, tokenized, tfv):

        doc = Document()

        # tag this document
        if tags:
            for i, tag in enumerate(tags):
                doc.add(Field("%s%d" %(tag, i), tag,
                              stored, True, tokenized, tfv))

        doc.add(Field("file", os.path.abspath(filename),
                      stored, True, tokenized, tfv))

        header = []
        body = []

        inFile = open(filename)
        reader = InputStreamReader(inFile, 'iso-8859-1')
        try:
            inHeader = True
            while True:
                line = reader.readline()
                if line == '':
                    break
                if inHeader:
                    # the first blank line ends the header and is dropped
                    if line.strip() == '':
                        inHeader = False
                        continue
                    header.append(line)
                else:
                    body.append(line)
        finally:
            reader.close()
            # harmless if the reader already closed the underlying file
            inFile.close()

        for line in header:
            if line.startswith("From: "):
                doc.add(Field("from", line[6:],
                              stored, True, tokenized, tfv))
            elif line.startswith("Subject: "):
                doc.add(Field("subject", line[9:],
                              stored, True, tokenized, tfv))
            elif line.startswith("Date: "):
                # parse date
                date = line[6:].strip()
                if ',' in date:
                    format = '%a, %d %b %Y %H:%M:%S %Z'
                else:
                    format = '%a %d %b %Y %H:%M:%S %Z'
                try:
                    t = long(time.mktime(time.strptime(date, format))) * 1000
                    val = DateField.timeToString(t)
                except Exception:
                    # Best effort: a date that cannot be parsed or encoded
                    # is replaced with the current time.
                    val = DateField.timeToString(System.currentTimeMillis())
                doc.add(Field("date", val, stored, True, False, False))

        # if additional fields are needed, add them here: one per leading
        # body line, for at most addFields lines
        for i in xrange(min(addFields, len(body))):
            doc.add(Field("line%d" %(i), body[i], stored, True, tokenized, tfv))

        # add body
        doc.add(Field("body", u'\n'.join(body), stored, True, tokenized, tfv))

        return doc

    # Make index, and collect time data.
    #  param trd run data to populate
    #  param srcDir directory with source files
    #  param iw index writer, already open
    #  param addFields fields to add to each document (see makeDocument method)
    #  param stored store values of fields
    #  param tokenized tokenize fields
    #  param tfv store term vectors

    def makeIndex(self, trd, srcDir, iw, addFields, stored, tokenized, tfv):

        groupDir = os.path.join(srcDir, os.listdir(srcDir)[0])
        groups = os.listdir(groupDir)

        cnt = 0
        td = TimeData("addDocument")

        for s in xrange(0, self.SCALE_UP):
            tags = [ "%s/%d" %(srcDir, s) ]
            for group in groups:
                fileDir = os.path.join(groupDir, group)
                for name in os.listdir(fileDir):
                    # only addDocument() is timed, not building the document
                    doc = self.makeDocument(os.path.join(fileDir, name),
                                            addFields, tags,
                                            stored, tokenized, tfv)
                    td.start()
                    iw.addDocument(doc)
                    td.stop()
                    cnt += 1
                    # one data point per LOG_STEP documents
                    if cnt % self.LOG_STEP == 0:
                        print(" - processed %d , run id= %s" %(cnt, trd.id))
                        trd.addData(td)
                        td.reset()

        # whatever was accumulated since the last full LOG_STEP
        trd.addData(td)

    # Run benchmark using supplied parameters. One TestRunData is appended
    # to params.runData for each of the RUN_COUNT iterations.
    #  param params benchmark parameters

    def runBenchmark(self, params):

        for i in xrange(0, self.RUN_COUNT):
            trd = TestRunData()
            trd.startRun()
            trd.id = str(i)

            iw = IndexWriter(params.dir, params.analyzer, True)
            try:
                iw.mergeFactor = params.mergeFactor
                iw.minMergeDocs = params.minMergeDocs
                iw.setUseCompoundFile(params.compound)
                self.makeIndex(trd, params.source, iw, 5, True, True, False)

                if params.optimize:
                    td = TimeData("optimize")
                    # added once before and once after optimizing; addData
                    # records memory usage each time
                    trd.addData(td)
                    td.start()
                    iw.optimize()
                    td.stop()
                    trd.addData(td)
            finally:
                iw.close()

            if params.queries is not None:
                self._runQueries(trd, params)

            trd.endRun()
            params.runData.append(trd)

    # Run every query benchmark of params.queries against the index in
    # params.dir, adding "-warm", "-srch" and "-trav" data points to trd.
    #  param trd run data to populate
    #  param params benchmark parameters

    def _runQueries(self, trd, params):

        ir = None
        searcher = None
        try:
            for qd in params.queries:
                if ir is not None and qd.reopen:
                    searcher.close()
                    ir.close()
                    ir = None
                    searcher = None

                if ir is None:
                    ir = IndexReader.open(params.dir)
                    searcher = IndexSearcher(ir)

                if qd.warmup:
                    td = TimeData(qd.id + "-warm")
                    for m in xrange(0, ir.maxDoc()):
                        td.start()
                        if not ir.isDeleted(m):
                            ir.document(m)
                        td.stop()

                    trd.addData(td)

                td = TimeData(qd.id + "-srch")
                td.start()
                h = searcher.search(qd.q)
                td.stop()
                if h is not None and len(h) == 0:
                    print("%s returned no hits" %(qd.q,))

                trd.addData(td)

                # stays empty when the search returned no hits
                td = TimeData(qd.id + "-trav")
                if h is not None:
                    for m in xrange(0, len(h)):
                        td.start()
                        docId = h.id(m)
                        if qd.retrieve:
                            ir.document(docId)
                        td.stop()

                trd.addData(td)
        finally:
            # Best effort: a failure to close must not hide the results or
            # an error raised above.
            for resource in (searcher, ir):
                if resource is not None:
                    try:
                        resource.close()
                    except Exception:
                        pass

    # Optional argument points to the output directory for the test.
    #  param args list of command line arguments

    def main(cls, args):

        bench = cls()
        if args:
            bench.setWorkDir(args[0])
        bench.setup()

        a = StandardAnalyzer()
        qs = cls.createQueries(cls.queries, a)

        # Here you can limit the set of query benchmarks
        qds = QueryData.getAll(qs)

        # Here you can narrow down the set of test parameters
        params = TestData.getAll([ bench.sourceDir, bench.jumboDir ],
                                 [ a ])

        for param in params:
            bench.reset()
            param.dir = FSDirectory.getDirectory(bench.indexDir, True)
            try:
                param.queries = qds
                print(param)
                bench.runBenchmark(param)

                # Here you can collect and output the runData for
                # further processing.
                print(param.showRunData(param.id))
            finally:
                # errors still propagate, but the directory is released
                param.dir.close()

            System.runFinalization()
            System.gc()

    main = classmethod(main)

    # Parse the strings containing Lucene queries.
    #  param qs array of strings containing query expressions
    #  param a analyzer to use when parsing queries
    #  return array of Lucene queries

    def createQueries(cls, qs, a):

        qp = QueryParser("body", a)

        return [ qp.parseQuery(expr) for expr in qs ]

    createQueries = classmethod(createQueries)
    

# This class holds parameters for a query benchmark.

class QueryData(object):

    # Parameters of a single query benchmark: the query to run, and the
    # flags selecting how the index reader is prepared and whether the
    # documents found are retrieved.

    def __init__(self):

        # Benchmark id
        self.id = None

        # Lucene query
        self.q = None

        # If true, re-open index reader before benchmark.
        self.reopen = False

        # If true, warm-up the index reader before searching by sequentially
        # retrieving all documents from index.
        self.warmup = False

        # If true, actually retrieve documents returned in Hits.
        self.retrieve = False

    # Prepare a list of benchmark data, using all possible combinations of
    # benchmark parameters.
    #  param queries source Lucene queries
    #  return list of QueryData, eight per query. The id is "qd-" followed
    #  by the query index and the reopen, warmup and retrieve flags as 0/1.

    def getAll(cls, queries):

        vqd = []
        for i, query in enumerate(queries):
            # reopen and warmup are tried enabled first, retrieve disabled
            # first
            for r in (1, 0):
                for w in (1, 0):
                    for t in (0, 1):
                        qd = cls()
                        qd.id = "qd-%d%d%d%d" %(i, r, w, t)
                        qd.reopen = bool(r)
                        qd.warmup = bool(w)
                        qd.retrieve = bool(t)
                        qd.q = query

                        vqd.append(qd)

        return vqd

    getAll = classmethod(getAll)

    # Short legend for interpreting __str__() output.

    def getLabels(cls):
        return "# Query data: R-reopen, W-warmup, T-retrieve, N-no"

    getLabels = classmethod(getLabels)

    # Return the id, the three flags using the letters explained by
    # getLabels(), and the query in square brackets.

    def __str__(self):
        return "%s %s %s %s [%s]" %(self.id,
                                    self.reopen and "R" or "NR",
                                    self.warmup and "W" or "NW",
                                    self.retrieve and "T" or "NT", self.q)


# This class holds a data point measuring speed of processing.

class TimeData(object):

    # A named data point: how many records were processed, how long that
    # took, and the memory figures sampled by recordMemUsage().

    def __init__(self, name=None):

        # Name of the data point - usually one of a data series
        # with the same name
        self.name = name

        # Number of records processed.
        self.count = 0

        # Elapsed time in floating point seconds.
        self.elapsed = 0.0

        # Time at which start() was last called.
        self.delta = 0.0

        # Free memory at the end of measurement interval.
        self.freeMem = 0

        # Total memory at the end of measurement interval.
        self.totalMem = 0

    # Start counting elapsed time.

    def start(self):
        self.delta = time.time()

    # Stop counting elapsed time: count one more record and add the time
    # passed since start() to the elapsed time.

    def stop(self):

        self.count += 1
        self.elapsed += time.time() - self.delta

    # Record memory usage.

    def recordMemUsage(self):

        runtime = Runtime.getRuntime()
        self.freeMem = runtime.freeMemory()
        self.totalMem = runtime.totalMemory()

    # Reset counters. The name and the memory figures are kept.

    def reset(self):

        self.count = 0
        self.elapsed = 0.0
        self.delta = 0.0

    # Return a new TimeData carrying a copy of all values of this one.

    def clone(self):

        td = TimeData(self.name)
        td.elapsed = self.elapsed
        td.count = self.count
        td.delta = self.delta
        td.freeMem = self.freeMem
        td.totalMem = self.totalMem

        return td

    # Get rate of processing, defined as number of processed records per second.
    #  return records per second, or 0 (after printing the counters) when
    #  no time has elapsed

    def getRate(self):

        try:
            return self.count / float(self.elapsed)
        except ZeroDivisionError:
            print("%s count: %s elapsed: %s" %(self.name, self.count,
                                               self.elapsed))
            return 0.0

    # Get a short legend for __str__() output.

    def getLabels(cls):
        return "# count\telapsed\trec/s\tfreeMem\ttotalMem"

    getLabels = classmethod(getLabels)

    # Return a tab-separated string containing this data, with the columns
    # named by getLabels().
    #  param withMem if true, or left out, append also memory information

    def __str__(self, withMem=None):

        # str(td) passes nothing and gets the complete record
        if withMem is None:
            withMem = True

        s = "%d\t%f\t%f" %(self.count, self.elapsed, self.getRate())
        if withMem:
            s += "\t%d\t%d" %(self.freeMem, self.totalMem)

        return s

# This class holds series of TimeData related to a single test run. TimeData
# values may contribute to different measurements, so this class provides also
# some useful methods to separate them.

class TestRunData(object):

    def __init__(self, id=None):
        
        self.id = id

        # Start and end time of this test run.
        self.start = 0L
        self.end = 0L

        self.data = {}

    # Mark the starting time of this test run.
    def startRun(self):
        self.start = time.time()

    # Mark the ending time of this test run.
    def endRun(self):
        self.end = time.time()

    # Add a data point.
    def addData(self, td):

        td.recordMemUsage()
        try:
            v = self.data[td.name]
        except KeyError:
            v = []
            self.data[td.name] = v

        v.append(td.clone())

    # Get a list of all available types of data points.
    def getLabels(self):
        return self.data.iterkeys()

    # Get total values from all data points of a given type.

    def getTotal(self, label):

        try:
            v = self.data[label]
        except KeyError:
            return None

        res = TimeData("TOTAL " + label)
        for td in v:
            res.count += td.count
            res.elapsed += td.elapsed

        return res

    # Get total values from all data points of all types.
    #  return a list of TimeData values for all types.
    def getTotals(self):
        return [ self.getTotal(label) for label in self.getLabels() ]

    # Get memory usage stats. for a given data type.

    def getMemUsage(self, label):

        try:
            v = self.data[label]
        except KeyError:
            return None

        res = MemUsage()
        res.minFree = (1L << 63) - 1
        res.minTotal = (1L << 63) - 1
        avgFree = 0L
        avgTotal = 0L
        for td in v:
            if res.maxFree < td.freeMem:
                res.maxFree = td.freeMem
            if res.maxTotal < td.totalMem:
                res.maxTotal = td.totalMem
            if res.minFree > td.freeMem:
                res.minFree = td.freeMem
            if res.minTotal > td.totalMem:
                res.minTotal = td.totalMem
            avgFree += td.freeMem
            avgTotal += td.totalMem

        res.avgFree = avgFree / len(v)
        res.avgTotal = avgTotal / len(v)

        return res

    def __str__(self):

        return '\n'.join(["%s-%s %s %s" %(self.id, label,
                                          self.getTotal(label).__str__(False),
                                          self.getMemUsage(label).toScaledString(1024 * 1024, "MB"))
                          for label in self.getLabels()])

# This class holds a set of memory usage values.
class MemUsage(object):

    # Minimum, average and maximum of the free and of the total memory.

    def __init__(self):

        self.maxFree = 0
        self.minFree = 0
        self.avgFree = 0

        self.maxTotal = 0
        self.minTotal = 0
        self.avgTotal = 0

    # Return the values unscaled, with "B" as the unit.

    def __str__(self):
        return self.toScaledString(1, "B")

    # Scale down the values by divisor, append the unit string.
    #  param div divisor applied to every value, rounding down
    #  param unit unit string appended after each group of three values
    #  return "free=min/avg/max unit, total=min/avg/max unit"

    def toScaledString(self, div, unit):

        return "free=%d/%d/%d %s, total=%d/%d/%d %s" %(self.minFree // div,
                                                       self.avgFree // div,
                                                       self.maxFree // div,
                                                       unit,
                                                       self.minTotal // div,
                                                       self.avgTotal // div,
                                                       self.maxTotal // div,
                                                       unit)

# This class holds together all parameters related to a test. Single test is
# performed several times, and all results are averaged.

class TestData(object):

    # One combination of benchmark parameters, together with the results
    # of the test runs made with it.

    MINMERGE_COUNTS = [ 10, 20, 50, 100, 200, 500 ]
    MERGEFACTOR_COUNTS = [ 10, 20, 50, 100, 200, 500 ]

    def __init__(self):

        # ID of this test data.
        self.id = None

        # Heap size.
        self.heap = Runtime.getRuntime().maxMemory()

        # List of results for each test run with these parameters.
        self.runData = []

        self.minMergeDocs = 0
        self.mergeFactor = 0

        # Directory containing source files.
        self.source = None

        # Lucene Directory implementation for creating an index.
        self.dir = None

        # Analyzer to use when adding documents.
        self.analyzer = None

        # If true, use compound file format.
        self.compound = False

        # If true, optimize index when finished adding documents.
        self.optimize = False

        # Data for search benchmarks.
        self.queries = None

    # Get a textual summary of the benchmark results, average from test runs.
    #  param prefix text written in the first column of every result row
    #  return a header line followed by one tab-separated row per type of
    #  data point, sorted by its label; or "# [NO RUN DATA]"

    def showRunData(self, prefix):

        # Sum of the processing rates of a label over the test runs.
        class DCounter(object):
            def __init__(self):
                self.total = 0
                self.count = 0
                self.recordCount = 0

        # Sum of one memory figure of a label over the test runs.
        class LCounter(object):
            def __init__(self):
                self.total = 0
                self.count = 0

        if len(self.runData) == 0:
            return "# [NO RUN DATA]"

        # label -> [ LCounter for free memory, LCounter for total memory ]
        mapMem = {}
        # label -> DCounter
        mapSpeed = {}

        for trd in self.runData:
            for label in trd.getLabels():
                mem = trd.getMemUsage(label)
                if mem is not None:
                    try:
                        tm = mapMem[label]
                    except KeyError:
                        tm = [ LCounter(), LCounter() ]
                        mapMem[label] = tm
                    tm[0].total += mem.avgFree
                    tm[0].count += 1
                    tm[1].total += mem.avgTotal
                    tm[1].count += 1

                td = trd.getTotal(label)
                if td is not None:
                    try:
                        dc = mapSpeed[label]
                    except KeyError:
                        dc = DCounter()
                        mapSpeed[label] = dc
                    dc.count += 1
                    dc.total += td.getRate()
                    dc.recordCount += td.count

        # columns runCnt, recCnt and rec/s
        res = {}
        for label, dc in mapSpeed.items():
            res[label] = "%d\t%d\t%f" %(dc.count,
                                        dc.recordCount // dc.count,
                                        dc.total / float(dc.count))

        # columns avgFreeMem and avgTotalMem
        for label, lc in mapMem.items():
            try:
                speed = res[label]
            except KeyError:
                # no speed data for this label: keep the columns aligned
                speed = "%d\t0\t%f" %(lc[0].count, 0.0)

            res[label] = "%s\t%s\t%s" %(speed,
                                        lc[0].total // lc[0].count,
                                        lc[1].total // lc[1].count)

        rows = [ "# testData id\toperation\trunCnt\trecCnt\trec/s\tavgFreeMem\tavgTotalMem\n" ]

        # sorted so that the order of the rows does not change between runs
        labels = list(res.keys())
        labels.sort()
        for label in labels:
            rows.append("%s\t%s\t%s\n" %(prefix, label, res[label]))

        return ''.join(rows)

    # Prepare a list of benchmark data, using all possible combinations of
    # benchmark parameters.
    #  param sources list of directories containing different source document
    #  collections
    #  param analyzers list of analyzers to use.
    #  return list of TestData. The id is "td-" followed by the index of the
    #  analyzer, source, minMergeDocs, mergeFactor, compound and optimize
    #  value used.

    def getAll(cls, sources, analyzers):

        res = []
        ref = TestData()

        for q, analyzer in enumerate(analyzers):
            for m, source in enumerate(sources):
                for i, minMergeDocs in enumerate(TestData.MINMERGE_COUNTS):
                    for k, mergeFactor in enumerate(TestData.MERGEFACTOR_COUNTS):
                        # n is the index of the first compound setting used;
                        # with 1, only compound files enabled is tested
                        n = 1
                        for compound in LuceneBenchmark.bools[n:]:
                            for p, optimize in enumerate(LuceneBenchmark.bools):
                                ref.id = "td-%d%d%d%d%d%d" %(q, m, i, k, n, p)
                                ref.source = source
                                ref.analyzer = analyzer
                                ref.minMergeDocs = minMergeDocs
                                ref.mergeFactor = mergeFactor
                                ref.compound = compound
                                ref.optimize = optimize
                                res.append(ref.clone())
                            n += 1

        return res

    getAll = classmethod(getAll)

    # Return a new TestData with the same parameters. The results in
    # runData and the queries are not copied.

    def clone(self):

        cl = TestData()
        cl.id = self.id
        cl.compound = self.compound
        cl.heap = self.heap
        cl.mergeFactor = self.mergeFactor
        cl.minMergeDocs = self.minMergeDocs
        cl.optimize = self.optimize
        cl.source = self.source
        cl.dir = self.dir
        cl.analyzer = self.analyzer
        # don't clone runData
        return cl

    # Return the parameters as comment lines starting with "#", followed
    # by the query legend and one line per query benchmark if there are
    # any queries.

    def __str__(self):

        lines = [ "#-- ID: %s, %s heap=%d --\n" %(self.id,
                                                  DateFormat.getInstance().format(Date()),
                                                  self.heap),
                  "# source=%s, dir=%s\n" %(self.source, self.dir),
                  "# minMergeDocs=%d, mergeFactor=%d" %(self.minMergeDocs,
                                                        self.mergeFactor),
                  ", compound=%d, optimize=%d\n" %(self.compound,
                                                   self.optimize) ]

        if self.queries is not None:
            lines.append(QueryData.getLabels())
            lines.append('\n')
            for qd in self.queries:
                lines.append("# %s\n" %(qd))

        return ''.join(lines)


# Script entry point: the command line arguments after the script name are
# passed to LuceneBenchmark.main(), which uses the first one, if any, as
# the working directory.
if __name__ == "__main__":
    LuceneBenchmark.main(sys.argv[1:])
