import types, string, re

"""
Unserialize class for the PHP serialization format.

@version v0.4 BETA
@author Scott Hurring; scott at hurring dot com
@copyright Copyright (c) 2005 Scott Hurring
@license http://opensource.org/licenses/gpl-license.php GNU Public License
$Id: PHPUnserialize.py,v 1.1 2006/01/08 21:53:19 shurring Exp $

Most recent version can be found at:
http://hurring.com/code/python/phpserialize/

Usage:
# Create an instance of the unserialize engine
u = PHPUnserialize()
# unserialize some string into python data
data = u.unserialize(serialized_string)

Please see README.txt for more information.
"""

class PHPUnserialize(object):
	"""
	Class to unserialize something from the PHP Serialize format.

	Usage:
	u = PHPUnserialize()
	data = u.unserialize(serialized_string)
	"""

	def __init__(self):
		pass

	def session_decode(self, data):
		"""Thanks to Ken Restivo for suggesting the addition
		of session_encode
		"""
		session = {}
		while len(data) > 0:
			m = re.match('^(\w+)\|', data)
			if m:
				key = m.group(1)
				offset = len(key)+1
				(dtype, dataoffset, value) = self._unserialize(data, offset)
				offset = offset + dataoffset
				data = data[offset:]
				session[key] = value
			else:
				# No more stuff to decode 
				return session
		
		return session
		
	def unserialize(self, data):
		return self._unserialize(data, 0)[2]

	def _unserialize(self, data, offset=0):
		"""
		Find the next token and unserialize it.
		Recurse on array.

		offset = raw offset from start of data
		
		return (type, offset, value)
		"""

		buf = []
		dtype = string.lower(data[offset:offset+1])

		#print "# dtype =", dtype

		# 't:' = 2 chars
		dataoffset = offset + 2
		typeconvert = lambda x : x
		chars = datalength = 0

 		# int => Integer
		if dtype == 'i':
			typeconvert = lambda x : int(x)
			(chars, readdata) = self.read_until(data, dataoffset, ';')
			# +1 for end semicolon
			dataoffset += chars + 1

 		# bool => Boolean
		elif dtype == 'b':
			typeconvert = lambda x : (int(x) == 1)
			(chars, readdata) = self.read_until(data, dataoffset, ';')
			# +1 for end semicolon
			dataoffset += chars + 1

		# double => Floating Point
		elif dtype == 'd':
			typeconvert = lambda x : float(x)
			(chars, readdata) = self.read_until(data, dataoffset, ';')
			# +1 for end semicolon
			dataoffset += chars + 1

		# n => None
		elif dtype == 'n':
			readdata = None

		# s => String
		elif dtype == 's':
			(chars, stringlength) = self.read_until(data, dataoffset, ':')
			# +2 for colons around length field
			dataoffset += chars + 2

			# +1 for start quote
			(chars, readdata) = self.read_chars(data, dataoffset+1, int(stringlength))
			# +2 for endquote semicolon
			dataoffset += chars + 2

			if chars != int(stringlength) != int(readdata):
				raise Exception("String length mismatch")

		# array => Dict
		# If you originally serialized a Tuple or List, it will
		# be unserialized as a Dict.  PHP doesn't have tuples or lists,
		# only arrays - so everything has to get converted into an array
		# when serializing and the original type of the array is lost
		elif dtype == 'a':
			readdata = {}

			# If all dict keys are increasing ints from zero, 
			#	then make it a list.
			# Else, 
			#	just return the original dict
			def is_list(keys):
				# list with no elements
				if len(keys) == 0:
					return True
				# list with one element: 0
				if len(keys) == 1:
					if type(keys[0]) == int and keys[0] == 0:
						return True
					else:
						return False

				# all other cases
				for i in range(1,len(keys)):
					x = keys[i-1]
					y = keys[i]
					if type(x) == int and type(y) == int and y-x == 1:
						continue
					else:
						return False
				return True

			# Empty {} and empty [] are ambiguous. 
			# A default could either be configured or assumed to be a list?
			def dict_to_list(dict):

				keys = dict.keys()
				keys.sort()

				if is_list(keys):
					return [dict[k] for k in keys]
				else:
					return dict
				
			typeconvert = lambda x : dict_to_list(x)

			# How many keys does this list have?
			(chars, keys) = self.read_until(data, dataoffset, ':')
			# +2 for colons around length field
			dataoffset += chars + 2

			# Loop through and fetch this number of key/value pairs
			for i in range(0, int(keys)):
				# Read the key
				(ktype, kchars, key) = self._unserialize(data, dataoffset)
				dataoffset += kchars
				#print "Key(%i) = (%s, %i, %s) %i" % (i, ktype, kchars, key, dataoffset)

				# Read value of the key
				(vtype, vchars, value) = self._unserialize(data, dataoffset)
				dataoffset += vchars
				#print "Value(%i) = (%s, %i, %s) %i" % (i, vtype, vchars, value, dataoffset)

				# Set the list element
				readdata[key] = value

				# +1 for end semicolon
			dataoffset += 1
			#chars = int(dataoffset) - start

		# I don't know how to unserialize this
		else:
			raise Exception("Unknown / Unhandled data type (%s)!" % dtype)


		return (dtype, dataoffset-offset, typeconvert(readdata))

	def read_until(self, data, offset, stopchar):
		"""
		Read from data[offset] until you encounter some char 'stopchar'.
		"""
		buf = []
		char = data[offset:offset+1]
		i = 2
		while char != stopchar:
			# Consumed all the characters and havent found ';'
			if i+offset > len(data):
				raise Exception("Invalid")
			buf.append(char)
			char = data[offset+(i-1):offset+i]
			i += 1

		# (chars_read, data)
		return (len(buf), "".join(buf))

	def read_chars(self, data, offset, length):
		"""
		Read 'length' number of chars from data[offset].
		"""
		buf = []
		# Account for the starting quote char
		#offset += 1
		for i in range(0, length):
			char = data[offset+(i-1):offset+i]
			buf.append(char)

		# (chars_read, data)
		return (len(buf), "".join(buf))