说明
如果需要用到这些知识却没有掌握,则会让人感到沮丧,也可能导致面试被拒。无论是花几天时间“突击”,还是利用零碎的时间持续学习,在数据结构上下点功夫都是值得的。那么Python 中有哪些数据结构呢?列表、字典、集合,还有……栈?Python 有栈吗?本系列文章将给出详细拼图。
第1章:ADT抽象数据类型,定义数据和其操作
什么是ADT: 抽象数据类型(Abstract Data Type),学过数据结构的应该都知道。
如何为 ADT 选择数据结构
- 数据结构是否满足 ADT 域指定的存储要求?
- 数据结构是否提供数据访问和操作功能来完全实现 ADT?
- 高效执行?基于复杂性分析。
下边代码是个简单的示例,比如实现一个简单的Bag类,先定义其具有的操作,然后我们再用类的magic method来实现这些方法:
class Bag:
    """A simple bag container backed by a Python list.

    Operations provided:
        - constructor
        - size (``len(bag)``)
        - contains (``item in bag``)
        - add
        - remove
        - iteration (``for item in bag``)
    """

    def __init__(self):
        # The underlying storage; duplicates are allowed.
        self._items = []

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self._items)

    def __contains__(self, item):
        """Return True when ``item`` is stored in the bag."""
        return item in self._items

    def add(self, item):
        """Append ``item`` to the bag."""
        self._items.append(item)

    def remove(self, item):
        """Remove the first occurrence of ``item``; it must be present."""
        assert item in self._items, 'item must in the bag'
        return self._items.remove(item)

    def __iter__(self):
        """Return a fresh iterator over the stored items."""
        return _BagIterator(self._items)


class _BagIterator:
    """Iterator class used by ``Bag.__iter__`` to walk the item list."""

    def __init__(self, seq):
        self._bag_items = seq
        self._cur_item = 0  # position of the next item to hand out

    def __iter__(self):
        return self

    def __next__(self):
        # Guard clause: signal exhaustion once the cursor passes the end.
        if self._cur_item >= len(self._bag_items):
            raise StopIteration
        current = self._bag_items[self._cur_item]
        self._cur_item += 1
        return current


b = Bag()
b.add(1)
b.add(2)
for i in b:  # ``for`` obtains the iterator via __iter__ and advances it via __next__
    print(i)

# The ``for`` statement above is equivalent to:
#
#     i = b.__iter__()
#     while True:
#         try:
#             item = i.__next__()
#             print(item)
#         except StopIteration:
#             break
第2章:array 和 list
array: 定长,操作有限,但是节省内存;貌似我的生涯中还没用过,不过python3.5中我试了确实有array类,可以用import array直接导入
list: 会预先分配内存,操作丰富,但是耗费内存。我用sys.getsizeof做了实验。我个人理解很类似C++ STL里的vector,是使用最频繁的数据结构。
- list.append: 如果之前没有分配够内存,会重新开辟新区域,然后复制之前的数据,此时单次操作的复杂度退化为O(n)
- list.insert: 会移动被插入区域后所有元素,O(n)
- list.pop: pop不同位置需要的复杂度不同,pop()弹出末尾元素是O(1)复杂度,pop(0)弹出首位元素是O(n)复杂度
- list[]: slice操作copy数据(预留空间)到另一个list
来实现一个array的ADT:
import ctypes


class Array:
    """Fixed-size one-dimensional array built on a ctypes ``py_object`` array.

    The size is chosen at construction time and never changes. Every slot
    is initialised to ``None``.
    """

    def __init__(self, size):
        """Create an array with ``size`` slots (``size`` must be > 0)."""
        assert size > 0, 'array size must be > 0'
        self._size = size
        # ``py_object * size`` creates a ctypes array type holding Python
        # object references; instantiating it allocates the storage.
        PyArrayType = ctypes.py_object * size
        self._elements = PyArrayType()
        self.clear(None)

    def __len__(self):
        """Return the fixed number of slots."""
        return self._size

    def __getitem__(self, index):
        """Return the element at ``index`` (0 <= index < len(self))."""
        assert 0 <= index < len(self), 'out of range'
        return self._elements[index]

    def __setitem__(self, index, value):
        """Store ``value`` at ``index`` (0 <= index < len(self))."""
        assert 0 <= index < len(self), 'out of range'
        self._elements[index] = value

    def clear(self, value):
        """Set every element to ``value``."""
        for i in range(len(self)):
            self._elements[i] = value

    def __iter__(self):
        """Return an iterator over the elements in index order."""
        return _ArrayIterator(self._elements)


class _ArrayIterator:
    """Iterator over the underlying ctypes array of an ``Array``."""

    def __init__(self, items):
        self._items = items
        self._idx = 0  # index of the next element to return

    def __iter__(self):
        return self

    def __next__(self):
        # Use the same attribute name that __init__ sets (``_idx``);
        # the previous spelling ``_idex`` raised AttributeError.
        if self._idx < len(self._items):
            val = self._items[self._idx]
            self._idx += 1
            return val
        raise StopIteration
2.1 二维数组 Two-Dimensional Arrays
class Array2D:
    """Two-dimensional array implemented as an array of row arrays.

    Operations:
        Array2D(nrows, ncols): constructor
        numRows
        numCols (also available under the legacy name ``NumCols``)
        clear(value)
        getitem(i, j)  -> ``grid[i, j]``
        setitem(i, j, val)  -> ``grid[i, j] = val``
    """

    def __init__(self, numrows, numcols):
        """Create a grid with ``numrows`` rows and ``numcols`` columns."""
        self._the_rows = Array(numrows)  # an array of arrays
        for i in range(numrows):
            self._the_rows[i] = Array(numcols)

    @property
    def numRows(self):
        """Number of rows."""
        return len(self._the_rows)

    @property
    def numCols(self):
        """Number of columns (length of the first row)."""
        return len(self._the_rows[0])

    # Keep the original capitalised spelling so existing callers that use
    # ``NumCols`` continue to work.
    NumCols = numCols

    def clear(self, value):
        """Set every cell of the grid to ``value``."""
        # Rows are reached by index, so this only needs Array's
        # __len__/__getitem__ and not its iterator.
        for r in range(self.numRows):
            self._the_rows[r].clear(value)

    def __getitem__(self, ndx_tuple):
        """Return the cell at ``ndx_tuple`` == (row, col)."""
        assert len(ndx_tuple) == 2
        row, col = ndx_tuple[0], ndx_tuple[1]
        assert (0 <= row < self.numRows and
                0 <= col < self.numCols)
        the_1d_array = self._the_rows[row]
        return the_1d_array[col]

    def __setitem__(self, ndx_tuple, value):
        """Store ``value`` in the cell at ``ndx_tuple`` == (row, col)."""
        assert len(ndx_tuple) == 2
        row, col = ndx_tuple[0], ndx_tuple[1]
        assert (0 <= row < self.numRows and
                0 <= col < self.numCols)
        the_1d_array = self._the_rows[row]
        the_1d_array[col] = value
2.2 The Matrix ADT, m行,n列。处理矩阵最好还是用pandas,自己实现比较*疼
class Matrix:
    """Matrix ADT stored in an ``Array2D`` grid (pandas' DataFrame is usually
    the better choice in practice).

    Operations of the ADT:
        Matrix(rows, ncols): constructor
        numRows / numCols
        getitem(row, col)
        setitem(row, col, val)
        scaleBy(scalar): multiply every element by scalar
        add(rhsMatrix): sizes must be the same
        transpose(), subtract(rhsMatrix), multiply(rhsMatrix): part of the
            ADT description but not implemented in this class.
    """

    def __init__(self, numRows, numCols):
        """Create a ``numRows`` x ``numCols`` matrix filled with zeros."""
        self._theGrid = Array2D(numRows, numCols)
        self._theGrid.clear(0)

    @property
    def numRows(self):
        """Number of rows."""
        return self._theGrid.numRows

    @property
    def numCols(self):
        """Number of columns."""
        # Array2D exposes the column count under the name ``NumCols``.
        return self._theGrid.NumCols

    # Legacy capitalised spelling kept for backward compatibility.
    NumCols = numCols

    def __getitem__(self, ndxTuple):
        """Return the element at ``ndxTuple`` == (row, col)."""
        return self._theGrid[ndxTuple[0], ndxTuple[1]]

    def __setitem__(self, ndxTuple, scalar):
        """Store ``scalar`` at ``ndxTuple`` == (row, col)."""
        self._theGrid[ndxTuple[0], ndxTuple[1]] = scalar

    def scaleBy(self, scalar):
        """Multiply every element in place by ``scalar``."""
        for r in range(self.numRows):
            for c in range(self.numCols):
                self[r, c] *= scalar

    def __add__(self, rhsMatrix):
        """Return a new matrix holding the element-wise sum.

        Both operands must have identical dimensions.
        """
        assert (rhsMatrix.numRows == self.numRows and
                rhsMatrix.numCols == self.numCols)
        newMatrix = Matrix(self.numRows, self.numCols)
        for r in range(self.numRows):
            for c in range(self.numCols):
                newMatrix[r, c] = self[r, c] + rhsMatrix[r, c]
        # Without this return, ``a + b`` evaluated to None.
        return newMatrix
第3章:Sets 和 Maps
除了list之外,最常用的应该就是python内置的set和dict了。
3.1 sets ADT
集合是一个容器,它存储给定可比域中唯一值的集合,其中存储的值没有特定的顺序。
class Set:
    """Set ADT implemented on top of a Python list.

    Operations:
        Set()
        length()            -> ``len(s)``
        contains(element)   -> ``element in s``
        add(element)
        remove(element)
        equals(setB)        -> ``s == setB``
        isSubsetOf(setB)
        union(setB)
        intersect(setB)
        difference(setB)
        iterator()          -> ``iter(s)``
    """

    def __init__(self):
        self._theElements = list()

    def __len__(self):
        """Return the number of elements."""
        return len(self._theElements)

    def __contains__(self, element):
        """Return True when ``element`` is a member."""
        return element in self._theElements

    def __iter__(self):
        """Return an iterator over the elements in insertion order.

        isSubsetOf, __eq__, union, intersect and difference all loop over
        a set, so the class must be iterable.
        """
        return iter(self._theElements)

    def add(self, element):
        """Add ``element`` unless it is already a member."""
        if element not in self:
            self._theElements.append(element)

    def remove(self, element):
        """Remove ``element``; it must be a member."""
        assert element in self, 'The element must be set'
        self._theElements.remove(element)

    def __eq__(self, setB):
        """Two sets are equal when they have the same size and this set is
        a subset of ``setB``."""
        if len(self) != len(setB):
            return False
        return self.isSubsetOf(setB)

    def isSubsetOf(self, setB):
        """Return True when every element of this set is in ``setB``."""
        return all(element in setB for element in self)

    def union(self, setB):
        """Return a new Set with the elements of this set and of ``setB``."""
        newSet = Set()
        newSet._theElements.extend(self._theElements)
        for element in setB:
            # add() skips elements already present, so the result stays
            # duplicate-free even if ``setB`` repeats a value.
            newSet.add(element)
        return newSet

    def intersect(self, setB):
        """Return a new Set with the elements present in both sets."""
        newSet = Set()
        for element in self:
            if element in setB:
                newSet._theElements.append(element)
        return newSet

    def difference(self, setB):
        """Return a new Set with the elements of this set not in ``setB``."""
        newSet = Set()
        for element in self:
            if element not in setB:
                newSet._theElements.append(element)
        return newSet
3.2 Maps or Dict: 键值对,python内部采用hash实现。
class Map:
    """Map ADT implemented with a list of key/value entries.

    Operations:
        Map()
        length()          -> ``len(m)``
        contains(key)     -> ``key in m``
        add(key, value)
        remove(key)
        valueOf(key)
        iterator()        -> ``iter(m)``
    """

    def __init__(self):
        self._entryList = list()

    def __len__(self):
        """Return the number of key/value pairs."""
        return len(self._entryList)

    def __contains__(self, key):
        """Return True when ``key`` is present."""
        ndx = self._findPosition(key)
        return ndx is not None

    def add(self, key, value):
        """Insert or update a key.

        Returns True when a new entry was appended and False when an
        existing key's value was replaced.
        """
        ndx = self._findPosition(key)
        if ndx is not None:
            self._entryList[ndx].value = value
            return False
        entry = _MapEntry(key, value)
        self._entryList.append(entry)
        return True

    def valueOf(self, key):
        """Return the value stored for ``key``; the key must exist."""
        ndx = self._findPosition(key)
        assert ndx is not None, 'Invalid map key'
        return self._entryList[ndx].value

    def remove(self, key):
        """Delete the entry for ``key``; the key must exist."""
        ndx = self._findPosition(key)
        assert ndx is not None, 'Invalid map key'
        self._entryList.pop(ndx)

    def __iter__(self):
        """Return an iterator over the keys in insertion order."""
        return _MapIterator(self._entryList)

    def _findPosition(self, key):
        """Return the list index of ``key``'s entry, or None if absent."""
        for i, entry in enumerate(self._entryList):
            if entry.key == key:
                return i
        return None


class _MapEntry:
    """Storage record for one key/value pair.

    ``collections.namedtuple('_MapEntry', 'key,value')`` would not work as
    a drop-in replacement here because ``Map.add`` assigns to ``value``.
    """

    def __init__(self, key, value):
        self.key = key
        self.value = value


class _MapIterator:
    """Iterator over the keys of a Map's entry list.

    Yielding keys (rather than entries) mirrors how the built-in ``dict``
    iterates.
    """

    def __init__(self, entryList):
        self._entries = entryList
        self._idx = 0  # index of the next entry to visit

    def __iter__(self):
        return self

    def __next__(self):
        if self._idx >= len(self._entries):
            raise StopIteration
        entry = self._entries[self._idx]
        self._idx += 1
        return entry.key
3.3 The multiArray ADT, 多维数组,一般是使用一个一维数组模拟,然后通过计算下标获取元素
class MultiArray:
    """Multi-dimensional array stored in a single 1-D array.

    Elements are laid out in row-major order (as opposed to column-major).

    Operations:
        MultiArray(d1, d2, ... dn)
        numDims: the number of dimensions
        length(dim): the length of the given (1-based) dimension
        clear(value)
        getitem(i1, i2, ... in)
        setitem(i1, i2, ... in)

    Index computation:
        index(i1, i2, ... in) = i1*f1 + i2*f2 + ... + i(n-1)*f(n-1) + in*1
    e.g. for three dimensions:
        index(i1, i2, i3) = i1*(d2*d3) + i2*d3 + i3
    """

    def __init__(self, *dimensions):
        """Create the array; at least two positive dimensions are required."""
        assert len(dimensions) > 1, 'The array must have 2 or more dimensions'
        self._dims = dimensions
        # Compute the total number of elements in the array.
        size = 1
        for d in dimensions:
            assert d > 0, 'Dimensions must be > 0'
            size *= d
        # Create the 1-D array to store the elements.
        self._elements = Array(size)
        # Create a 1-D array to store the equation factors.
        self._factors = Array(len(dimensions))
        self._computeFactors()

    @property
    def numDims(self):
        """Number of dimensions."""
        return len(self._dims)

    def length(self, dim):
        """Return the length of dimension ``dim`` (1-based)."""
        # ``dim`` is 1-based (note the ``dim - 1`` below), so the last
        # dimension, dim == numDims, must be accepted as well.
        assert 1 <= dim <= len(self._dims), 'Dimension component out of range'
        return self._dims[dim - 1]

    def clear(self, value):
        """Set every element to ``value``."""
        self._elements.clear(value)

    def __getitem__(self, ndxTuple):
        """Return the element at the subscripts given in ``ndxTuple``."""
        assert len(ndxTuple) == self.numDims, 'Invalid # of array subscripts'
        index = self._computeIndex(ndxTuple)
        assert index is not None, 'Array subscript out of range'
        return self._elements[index]

    def __setitem__(self, ndxTuple, value):
        """Store ``value`` at the subscripts given in ``ndxTuple``."""
        assert len(ndxTuple) == self.numDims, 'Invalid # of array subscripts'
        index = self._computeIndex(ndxTuple)
        assert index is not None, 'Array subscript out of range'
        self._elements[index] = value

    def _computeIndex(self, ndxTuple):
        """Map the subscripts to a 1-D offset, or None if any is out of range.

        Uses the equation: i1*f1 + i2*f2 + ... + in*fn
        """
        offset = 0
        for j in range(len(ndxTuple)):
            if ndxTuple[j] < 0 or ndxTuple[j] >= self._dims[j]:
                return None
            offset += ndxTuple[j] * self._factors[j]
        return offset

    def _computeFactors(self):
        """Fill ``self._factors`` with the row-major multipliers.

        The last factor is 1; each earlier factor is the product of all
        later dimension lengths: f[j] = f[j+1] * d[j+1].
        """
        last = len(self._dims) - 1
        self._factors[last] = 1
        for j in range(last - 1, -1, -1):
            self._factors[j] = self._factors[j + 1] * self._dims[j + 1]
第4章:Algorithm Analysis
一般使用大O标记法来衡量算法的平均时间复杂度,1 < log(n) < n < nlog(n) < n^2 < n^3 < a^n。了解常用数据结构操作的平均时间复杂度有利于使用更高效的数据结构,当然有时候需要在时间和空间上进行权衡,有些操作甚至还会退化,比如list的append操作,如果list空间不够,会去开辟新的空间,操作复杂度退化到O(n),有时候还需要使用均摊分析(amortized)。