aboutsummaryrefslogtreecommitdiff
path: root/contrib/gnunet-chk.py
blob: e5164127229384984c8f159b58b39ca16a9a4934 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
#!/usr/bin/python
# This file is part of GNUnet.
# (C) 2013 Christian Grothoff (and other contributing authors)
#
# GNUnet is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation; either version 3, or (at your
# option) any later version.
#
# GNUnet is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNUnet; see the file COPYING.  If not, write to the
# Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.
# 
# File:    gnunet-chk.py
# Brief:   Computes GNUNET style Content Hash Key for a given file
# Author:  Sree Harsha Totakura

from hashlib import sha512
import logging
import os
import getopt
import sys
from Crypto.Cipher import AES

# Defaults
DBLOCK_SIZE = (32 * 1024)   # Data block size

# Pick a multiple of 2 here to achive 8-byte alignment!  We also
# probably want DBlocks to have (roughly) the same size as IBlocks.
# With SHA-512, the optimal value is 32768 byte / 128 byte = 256 (128
# byte = 2 * 512 bits).  DO NOT CHANGE!
CHK_PER_INODE = 256

CHK_HASH_SIZE = 64              # SHA-512 hash = 512 bits = 64 bytes

CHK_QUERY_SIZE = CHK_HASH_SIZE  # Again a SHA-512 hash

GNUNET_FS_URI_PREFIX = "gnunet://fs/" # FS CHK URI prefix

GNUNET_FS_URI_CHK_INFIX = "chk/" # FS CHK URI infix


def encode_data_to_string (data):
    """Returns an ASCII encoding of the given data block like
    GNUNET_STRINGS_data_to_string() function.
    
    data: A bytearray representing the block of data which has to be encoded
    """
    echart = "0123456789ABCDEFGHIJKLMNOPQRSTUV"    
    assert (None != data)
    assert (bytearray == type(data))
    size = len(data)
    assert (0 != size)
    vbit = 0
    wpos = 0
    rpos = 0
    bits = 0
    out = ""
    while (rpos < size) or (vbit > 0):
        if (rpos < size) and (vbit < 5):
            bits = (bits << 8) | data[rpos] # eat 8 more bits
            rpos += 1
            vbit += 8
        if (vbit < 5):
            bits <<= (5 - vbit) # zero-padding
            assert (vbit == ((size * 8) % 5))
            vbit = 5
        out += echart[(bits >> (vbit - 5)) & 31]
        wpos += 1
        vbit -= 5
    assert (0 == vbit)
    return out;


def sha512_hash (data):
    """ Returns the sha512 hash of the given data. 
    
    data: string to hash
    """
    hash_obj = sha512()
    hash_obj.update (data)
    return hash_obj.digest()


class AESKey:
    """Class for AES Keys. Contains the main key and the initialization
    vector. """

    key = None                  # The actual AES key
    iv = None                   # The initialization vector
    cipher = None               # The cipher object
    KEY_SIZE = 32               # AES 256-bit key = 32 bytes
    IV_SIZE = AES.block_size    # Initialization vector size (= AES block size)
    
    def __init__ (self, passphrase):
        """Creates a new AES key. 
        
        passphrase: string containing the passphrase to get the AES key and
                      initialization vector
        """
        passphrase = bytearray (passphrase);
        self.key = bytearray (self.KEY_SIZE)
        self.iv = bytearray (self.IV_SIZE)
        if (len (passphrase) > self.KEY_SIZE):
            self.key = passphrase[:self.KEY_SIZE]
            passphrase = passphrase[self.KEY_SIZE:]
            if (len (passphrase) > self.IV_SIZE):
                self.iv = passphrase[:self.IV_SIZE]
            else:
                self.iv[0:len (passphrase)] = passphrase
        else:
            self.key[0:len (passphrase)] = passphrase
        self.key = str (self.key)
        self.iv = str (self.iv)
        assert (len(self.key) == self.KEY_SIZE)
        assert (len(self.iv) == self.IV_SIZE)

def setup_aes_cipher_ (aes_key):
    """Initializes the AES object with settings similar to those in GNUnet.
    
    aes_key: the AESKey object
    Returns the newly initialized AES object
    """
    return AES.new (aes_key.key, AES.MODE_CFB, aes_key.iv, segment_size=128)

def aes_pad_ (data):
    """Adds padding to the data such that the size of the data is a multiple of
    16 bytes
    
    data: the data string
    Returns a tuple:(pad_len, data). pad_len denotes the number of bytes added
    as padding; data is the new data string with padded bytes at the end
    """
    pad_len = len(data) % 16
    if (0 != pad_len):
        pad_len = 16 - pad_len
        pad_bytes = bytearray (15)
        data += str(pad_bytes[:pad_len])
    return (pad_len, data)

def aes_encrypt (aes_key, data):
    """Encrypts the given data using AES.

    aes_key: the AESKey object to use for AES encryption
    data: the data string to encrypt
    """
    (pad_len, data) = aes_pad_ (data)
    cipher = setup_aes_cipher_ (aes_key)
    enc_data = cipher.encrypt (data)
    if (0 != pad_len):
        enc_data = enc_data[:-pad_len]
    return enc_data

def aes_decrypt (aes_key, data):
    """Decrypts the given data using AES
    
    aes_key: the AESKey object to use for AES decryption
    data: the data string to decrypt
    """
    (pad_len, data) = aes_pad_ (data)
    cipher = setup_aes_cipher_ (aes_key)
    ptext = cipher.decrypt (data)
    if (0 != pad_len):
        ptext = ptext[:-pad_len]
    return ptext


class Chk:
    """Class for the content hash key."""
    key = None
    query = None
    fsize = None

    def __init__(self, key, query):
        assert (len(key) == CHK_HASH_SIZE)
        assert (len(query) == CHK_QUERY_SIZE)
        self.key = key
        self.query = query

    def setSize(self, size):
        self.fsize = size

    def uri(self):
        sizestr = repr (self.fsize)
        if isinstance (self.fsize, long):
            sizestr = sizestr[:-1]            
        return GNUNET_FS_URI_PREFIX + GNUNET_FS_URI_CHK_INFIX + \
            encode_data_to_string(bytearray(self.key)) + "." + \
            encode_data_to_string(bytearray(self.query)) + "." + \
            sizestr


def compute_depth_(size):
    """Computes the depth of the hash tree.
    
    size: the size of the file whose tree's depth has to be computed
    Returns the depth of the tree. Always > 0.
    """
    depth = 1
    fl = DBLOCK_SIZE
    while (fl < size):
        depth += 1
        if ((fl * CHK_PER_INODE) < fl):
            return depth
        fl = fl * CHK_PER_INODE
    return depth

def compute_tree_size_(depth):
    """Calculate how many bytes of payload a block tree of the given depth MAY
     correspond to at most (this function ignores the fact that some blocks will
     only be present partially due to the total file size cutting some blocks
     off at the end).

     depth: depth of the block.  depth==0 is a DBLOCK.
     Returns the number of bytes of payload a subtree of this depth may
     correspond to.
     """
    rsize = DBLOCK_SIZE
    for cnt in range(0, depth):
        rsize *= CHK_PER_INODE
    return rsize

def compute_chk_offset_(depth, end_offset):
    """Compute the offset of the CHK for the current block in the IBlock
    above
    
    depth: depth of the IBlock in the tree (aka overall number of tree levels
             minus depth); 0 == DBLOCK
    end_offset: current offset in the overall file, at the *beginning* of the
                  block for DBLOCK (depth == 0), otherwise at the *end* of the
                  block (exclusive)
    Returns the offset in the list of CHKs in the above IBlock
    """
    bds = compute_tree_size_(depth)
    if (depth > 0):
        end_offset -= 1
    ret = end_offset / bds
    return ret % CHK_PER_INODE

def compute_iblock_size_(depth, offset):
    """Compute the size of the current IBLOCK.  The encoder is triggering the
    calculation of the size of an IBLOCK at the *end* (hence end_offset) of its
    construction.  The IBLOCK maybe a full or a partial IBLOCK, and this
    function is to calculate how long it should be.
    
    depth: depth of the IBlock in the tree, 0 would be a DBLOCK, must be > 0
             (this function is for IBLOCKs only!)
    offset: current offset in the payload (!) of the overall file, must be > 0
              (since this function is called at the end of a block).
    Returns the number of elements to be in the corresponding IBlock
    """
    assert (depth > 0)
    assert (offset > 0)
    bds = compute_tree_size_ (depth)
    mod = offset % bds
    if mod is 0:
        ret = CHK_PER_INODE
    else:
        bds /= CHK_PER_INODE
        ret = mod / bds
        if (mod % bds) is not 0:
            ret += 1
    return ret


def compute_rootchk(readin, size):
    """Returns the content hash key after generating the hash tree for the given
    input stream.

    readin: the stream where to read data from
    size: the size of data to be read
    """
    depth = compute_depth_(size);
    current_depth = 0
    chks = [None] * (depth * CHK_PER_INODE) # list buffer
    read_offset = 0
    logging.debug("Begining to calculate tree hash with depth: "+ repr(depth))
    while True:
        if (depth == current_depth):
            off = CHK_PER_INODE * (depth - 1)
            assert (chks[off] is not None)
            logging.debug("Encoding done, reading CHK `"+ chks[off].query + \
                              "' from "+ repr(off) +"\n")
            uri_chk = chks[off]
            assert (size == read_offset)
            uri_chk.setSize (size)
            return uri_chk
        if (0 == current_depth):
            pt_size = min(DBLOCK_SIZE, size - read_offset);
            try:
                pt_block = readin.read(pt_size)
            except IOError:
                logging.warning ("Error reading input file stream")
                return None
        else:
            pt_elements = compute_iblock_size_(current_depth, read_offset)
            pt_block = ""
            pt_block = \
                reduce ((lambda ba, chk:
                             ba + (chk.key + chk.query)),
                        chks[(current_depth - 1) * CHK_PER_INODE:][:pt_elements],
                        pt_block)
            pt_size = pt_elements * (CHK_HASH_SIZE + CHK_QUERY_SIZE)
        assert (len(pt_block) == pt_size)
        assert (pt_size <= DBLOCK_SIZE)
        off = compute_chk_offset_ (current_depth, read_offset)
        logging.debug ("Encoding data at offset "+ repr(read_offset) + \
                           " and depth "+ repr(current_depth) +" with block " \
                           "size "+ repr(pt_size) +" and target CHK offset "+ \
                           repr(current_depth * CHK_PER_INODE))
        pt_hash = sha512_hash (pt_block)
        pt_aes_key = AESKey (pt_hash)
        pt_enc = aes_encrypt (pt_aes_key, pt_block)
        pt_enc_hash = sha512_hash (pt_enc)
        chk = Chk(pt_hash, pt_enc_hash)
        chks[(current_depth * CHK_PER_INODE) + off] = chk
        if (0 == current_depth):
            read_offset += pt_size
            if (read_offset == size) or \
                    (0 == (read_offset % (CHK_PER_INODE * DBLOCK_SIZE))):
                current_depth += 1
        else:
            if (CHK_PER_INODE == off) or (read_offset == size):
                current_depth += 1
            else:
                current_depth = 0


def chkuri_from_path (path):
    """Returns the CHK URI of the file at the given path.
    
    path: the path of the file whose CHK has to be calculated
    """
    size = os.path.getsize (path)
    readin = open (path, "rb")
    chk = compute_rootchk (readin, size)
    readin.close()
    return chk.uri()

def usage ():
    """Prints help about using this script."""
    print """
Usage: gnunet-chk.py [options] file
Prints the Content Hash Key of given file in GNUNET-style URI.

Options:
    -h, --help                : prints this message
"""


if '__main__' == __name__:
    try:
        opts, args = getopt.getopt(sys.argv[1:], 
                                   "h", 
                                   ["help"])
    except getopt.GetoptError, err:
        print err
        print "Exception occured"
        usage()
        sys.exit(2)
    for option, value in opts:
        if option in ("-h", "--help"):
            usage()
            sys.exit(0)
    if len(args) != 1:
        print "Incorrect number of arguments passed"
        usage()
        sys.exit(1)
    print chkuri_from_path (args[0])