# File: timeIO.py

"""
test various read and write file I/O modes for speed
in the version of Python that is running this script;
runs most common and valid read/write coding patterns;
tests ascii and binary, but not wide-char unicode files;
printed results can be parsed later for comparisons
"""


######################################################################
# generic timer
######################################################################

import time

def timeOnce(func, *args):
    """
    Call func(*args) once and return the elapsed time in float seconds.

    func's return value is ignored; any exception it raises propagates
    to the caller.  time.clock() was removed in Python 3.8, so the
    high-resolution time.perf_counter() is used whenever it exists and
    time.clock() is kept only as the fallback for older interpreters.
    """
    # pick the clock before starting, so the lookup is not part of the timing
    clock = getattr(time, 'perf_counter', None) or time.clock
    start = clock()
    func(*args)                                 # ignore any return value
    return clock() - start

def timerAvg(func, *args):
    """
    Return the mean elapsed time of 3 timed calls to func(*args).

    One extra, untimed call is made first so that later runs start
    with the same warm caches.
    """
    reps = 3
    timeOnce(func, *args)                       # warm-up run: result discarded
    total = 0
    for _ in range(reps):
        total = total + timeOnce(func, *args)   # accumulate each run's time
    return total / reps                         # average, not the best run


# The following may be a bit better, but isn't directly comparable

def timerBest(func, *args):
    """
    Return the smallest elapsed time of 3 timed calls to func(*args).

    One extra, untimed call is made first to make sure disk caches
    are active before any measurement is taken.
    """
    reps = 3
    timeOnce(func, *args)                       # warm-up run: result discarded
    return min(timeOnce(func, *args) for _ in range(reps))

# Timing strategy called by timePython(): best-of-N via timerBest;
# rebind this name to timerAvg to report the mean of the runs instead.
timer = timerBest                               # CHANGED


######################################################################
# file read tests
######################################################################

#=====================================================================
# all of the following are probably valid use cases for 2.6 and 3.0,
# though lines/text and blocks/binary combos seem more typical in 3.0
# (programs will pick str xor bytes for text or binary data in 3.0); 
# truly binary files can only be read in binary mode in 3.0, because 
# they cannot be decoded into characters in text mode, and it makes 
# no sense to read truly binary files by lines: they have no lineends;
# the allAtOnce modes may fail for pathologically large files;
#
# 3.0 has str + bytes; 2.6 has just str, plus binary files
# open mode default = 'r' = 'rt' in 3.0, and 'r' in 2.6
# (both mean text mode input when the mode argument is omitted)
#=====================================================================


# Default amount requested per read() call by the read_byBlocks_* tests (32K).
blocksize = 32 * 1024


def read_byLines_textMode(filename):
    """
    Read filename line by line in text mode, discarding every line.

    2.6 text mode returns str and does not decode (use codecs.open);
    3.0 text mode returns str, after decoding content.  The file is
    closed explicitly on exit instead of being left to the garbage
    collector.
    """
    with open(filename) as infile:            # mode omitted: text input
        for line in infile:
            pass

def read_byLines_binaryMode(filename):        # less common in 3.0?
    """
    Read filename line by line in binary mode, discarding every line.

    2.6 binary mode returns str, 3.0 binary mode returns bytes; neither
    decodes.  The file is closed explicitly on exit instead of being
    left to the garbage collector.
    """
    with open(filename, 'rb') as infile:
        for line in infile:
            pass


def read_byBlocks_textMode(filename, size=blocksize):
    """
    Read filename in text mode, size units per read() call, until an
    empty result signals end of file; every block is discarded.

    The file is closed explicitly on exit instead of being left to
    the garbage collector.
    """
    with open(filename) as infile:             # less common in 3.0?
        while True:
            block = infile.read(size)
            if not block:                      # empty result: end of file
                break

def read_byBlocks_binaryMode(filename, size=blocksize):
    """
    Read filename in binary mode, size bytes per read() call, until an
    empty result signals end of file; every block is discarded.

    The file is closed explicitly on exit instead of being left to
    the garbage collector.
    """
    with open(filename, 'rb') as infile:
        while True:
            block = infile.read(size)
            if not block:                      # empty result: end of file
                break


def read_allAtOnce_textMode(filename):         # not for very large files
    """
    Load the entire content of filename with one text-mode read() call
    and discard it.

    The file is closed explicitly on exit instead of being left to
    the garbage collector.
    """
    with open(filename) as infile:
        infile.read()                          # result is not kept

def read_allAtOnce_binaryMode(filename):       # not for very large files
    """
    Load the entire content of filename with one binary-mode read()
    call and discard it.

    The file is closed explicitly on exit instead of being left to
    the garbage collector.
    """
    with open(filename, 'rb') as infile:
        infile.read()                          # result is not kept


######################################################################
# file write tests
######################################################################

#=====================================================================
# all the following work, but tests "write_byLines_binaryMode" and
# "write_byBlocks_textMode" are probably invalid use cases for 3.0,
# where programs are more likely to pick str xor bytes for text
# or binary data, and not convert to str or bytes just to write in 
# text or binary mode; portability issues: 3.0's encoding arg required 
# by 3.0's bytes() converter is not allowed in 2.6's bytes(), and 
# 2.6's str.decode() creates a unicode object which adds some cost;
#
# hoist set-up ops out to avoid charging to test funcs
# 'xx' / b'xx' are str / bytes in 3.0, both are str in 2.6
# 'xx' == b'xx' and bytes(x) == str(x) in 2.6
# 2.6: str is a seq of bytes, unicode a distinct type
# 3.0: str is seq of Unicode chars, bytes is seq of ints
#=====================================================================


# Output payloads are built once, at import time, so that their
# construction cost is not charged to the write test functions.
oneMeg   = 1024 ** 2
halfMeg  = oneMeg // 2               # floor division: an int in both 2.6 and 3.0
repsList = list(range(halfMeg))      # a real list in both 2.6 and 3.0

aLine    = '%s\n' % ('*' * 49)       # 50 chars (+\r?) per line, halfMeg lines: 25M in file
aBlock   = 10 * b'1\x0234\x05'       # 50 bytes per block, halfMeg blocks: 25M in file
aFileStr = halfMeg * aLine           # whole text file content: 25M characters
aFileBin = halfMeg * aBlock          # whole binary file content: 25M bytes

# report the size of each payload, in the order they are defined above
print ('\nOutput data sizes: %s %s %s %s %s' %
      tuple(map(len, (repsList, aLine, aBlock, aFileStr, aFileBin))) )


def write_byLines_textMode(filename):       # writing by blocks in text mode is similar
    """
    Write aLine to filename once per item in repsList, in text mode.

    3.0 text mode takes str, encodes content and translates newlines
    (an open() flag controls lineends); 2.6 text mode takes str and
    translates newlines.  The with statement closes the file even if
    a write fails part way through.
    """
    with open(filename, 'w') as outfile:
        for i in repsList:
            outfile.write(aLine)

def write_byLines_binaryMode(filename):     # less common in 3.0?
    """
    Write aLine to filename once per item in repsList, in binary mode.

    3.0 binary mode takes bytes and does not decode or translate;
    2.6 binary mode takes str and does not translate newlines.  The
    per-write encode() is deliberately part of the timed work.  The
    with statement closes the file even if a write fails part way
    through.
    """
    with open(filename, 'wb') as outfile:
        for i in repsList:
            outfile.write(aLine.encode())   # encode() makes bytes in 3.0, same str in 2.6


def write_byBlocks_textMode(filename):      # less common in 3.0?
    """
    Write aBlock to filename once per item in repsList, in text mode.

    The per-write decode() is deliberately part of the timed work.
    The with statement closes the file even if a write fails part
    way through.
    """
    with open(filename, 'w') as outfile:
        for i in repsList:
            outfile.write(aBlock.decode())  # decode() makes str in 3.0, unicode in 2.6

def write_byBlocks_binaryMode(filename):    # writing by lines in binary mode is similar
    """
    Write aBlock to filename once per item in repsList, in binary mode.

    The with statement closes the file even if a write fails part
    way through.
    """
    with open(filename, 'wb') as outfile:
        for i in repsList:
            outfile.write(aBlock)
                                            
                                            
def write_allAtOnce_textMode(filename):     # not for very large files
    """
    Write all of aFileStr to filename with one text-mode write() call.

    The file is closed before returning, so flushing the data is part
    of the timed call instead of depending on when the garbage
    collector reclaims the file object.
    """
    with open(filename, 'w') as outfile:
        outfile.write(aFileStr)

def write_allAtOnce_binaryMode(filename):   # not for very large files
    """
    Write all of aFileBin to filename with one binary-mode write() call.

    The file is closed before returning, so flushing the data is part
    of the timed call instead of depending on when the garbage
    collector reclaims the file object.
    """
    with open(filename, 'wb') as outfile:
        outfile.write(aFileBin)


######################################################################
# run, collect test data for Python running me
######################################################################

def timePython():
    """
    Time every read and write test under the Python running this script
    and print one result line per test.

    Command line: <script> textfile binaryfile.  The text file is run
    through all six read tests, the binary file through the two
    binary-mode block/all-at-once read tests, and the six write tests
    create the hard-coded output file 'timeIO.out'.

    A test that raises an Exception is reported as '*fail*' with the
    exception class and the run continues; KeyboardInterrupt and
    SystemExit are not caught.  Exits with a usage message if fewer
    than two file arguments are given.
    """
    import sys, os
    outputfile = 'timeIO.out'                      # hard-code: I create this
    if len(sys.argv) < 3:
        sys.exit('usage: %s textfile binaryfile' % sys.argv[0])
    textfile, binaryfile = sys.argv[1:3]           # input files vary, command line

    # ordered (filename, tests) pairs rather than a dict keyed by filename:
    # no group is silently dropped if two of the paths happen to be equal
    tests = [(textfile,   (read_byLines_textMode,
                           read_byLines_binaryMode,     # less common in 3.0?
                           read_byBlocks_textMode,      # less common in 3.0?
                           read_byBlocks_binaryMode,
                           read_allAtOnce_textMode,     # not for very large files
                           read_allAtOnce_binaryMode)), # not for very large files

             (binaryfile, (read_byBlocks_binaryMode,    # other read modes not valid,
                           read_allAtOnce_binaryMode)), # for truly binary data files

             (outputfile, (write_byLines_textMode,
                           write_byLines_binaryMode,    # less common in 3.0?
                           write_byBlocks_textMode,     # less common in 3.0?
                           write_byBlocks_binaryMode,
                           write_allAtOnce_textMode,    # not for very large files
                           write_allAtOnce_binaryMode)) # not for very large files
             ]

    version = sys.version.split()[0]
    for (filename, funcs) in tests:
        # the output file may not exist yet on the first run
        filesize = os.path.getsize(filename) if os.path.exists(filename) else 0
        print('\n[Python {0}: {1}, {2} bytes]'.format(version, filename, filesize))

        for func in funcs:
            try:
                testtime = timer(func, filename)
            except Exception:
                # report the failure and keep going; Ctrl-C still aborts the run
                print('%-26s => %s, %s' % (func.__name__, '*fail*', sys.exc_info()[0]))
            else:
                # size is read after the test ran, so a freshly written
                # output file reports its real size instead of 0
                filesize = os.path.getsize(filename) if os.path.exists(filename) else 0
                # int/int=float+remainder in 3.0, but not 2.6
                filemegs = float(filesize) / oneMeg
                testid   = '%-26s (%s=%.2fM)' % (func.__name__, filename, filemegs)
                print('%-46s  => %f' % (testid, testtime))

# Run the tests only when this file is executed as a script, not when imported.
if __name__ == '__main__': 
    timePython()   # times the Python version running this script