def modelsize(model, input, type_size=4):
    """Estimate the memory footprint of a model's parameters and
    intermediate activations, printing the results in MB.

    Args:
        model: a torch.nn.Module whose sub-modules are applied one after
            another to the running activation (sequential-style models).
        input: example input tensor fed through the model. (The name
            shadows the ``input`` builtin but is kept for compatibility.)
        type_size: bytes per element; default 4 (float32).

    Returns:
        None — results are printed, not returned.

    NOTE(review): reconstructed from a whitespace-mangled, partially
    duplicated paste; the setup lines (input clone, module list) were not
    visible and are restored from the evident intent — confirm against
    the original source.
    """
    # Total number of parameter elements across the whole model.
    para = sum([np.prod(list(p.size())) for p in model.parameters()])
    print('Model {} : params: {:4f}M'.format(
        model._get_name(), para * type_size / 1000 / 1000))

    # Push a detached copy of the input through each sub-module in turn,
    # recording every intermediate output size along the way.
    input_ = input.clone()
    input_.requires_grad_(requires_grad=False)

    mods = list(model.modules())
    out_sizes = []
    for i in range(1, len(mods)):  # mods[0] is the model itself — skip it
        m = mods[i]
        # An in-place ReLU allocates no new output tensor, so it
        # contributes nothing to activation memory — skip it.
        if isinstance(m, nn.ReLU) and m.inplace:
            continue
        out = m(input_)
        out_sizes.append(np.array(out.size()))
        input_ = out

    # Total number of elements across all recorded intermediate outputs.
    total_nums = 0
    for s in out_sizes:
        total_nums += np.prod(np.array(s))

    # Backward roughly doubles activation memory (gradients of the
    # intermediates), hence the *2 in the second estimate.
    print('Model {} : intermediate variables: {:3f} M (without backward)'.format(
        model._get_name(), total_nums * type_size / 1000 / 1000))
    print('Model {} : intermediate variables: {:3f} M (with backward)'.format(
        model._get_name(), total_nums * type_size * 2 / 1000 / 1000))
# fram = inspect.currentframe() # func_name = fram.f_code.co_name # filename = fram.f_globals["__file__"] # ss = os.path.dirname(os.path.abspath(filename)) # module_name = fram.f_globals["__name__"]
def gpu_profile(frame, event):
    """Trace hook (for ``sys.settrace``) that logs total GPU memory and
    per-tensor allocation deltas for each executed source line.

    Because the hook fires when a line is *about to* execute, the memory
    report written here describes the *previous* traced line; the current
    frame's location is saved for the next invocation.

    Args:
        frame: the frame object being traced.
        event: trace event name; only ``'line'`` events are handled.

    Returns:
        This function itself, so tracing continues into local scopes.

    NOTE(review): relies on module-level state defined elsewhere in the
    file (``last_tensor_sizes``, ``lineno``, ``func_name``, ``filename``,
    ``module_name``, ``gpu_profile_fn``, ``print_tensor_sizes``) and on
    ``pynvml`` / ``get_tensors`` — reconstructed from a whitespace-mangled
    paste; confirm against the original.
    """
    # it is _about to_ execute (!)
    global last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            # Report on the _previous_ line (!) — the one that just ran.
            if lineno is not None:
                pynvml.nvmlInit()
                # handle = pynvml.nvmlDeviceGetHandleByIndex(int(os.environ['GPU_DEBUG']))
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ':' + ' line ' + str(lineno)
                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"At {where_str:<50}"
                            f"Total Used Memory:{meminfo.used/1024**2:<7.1f}Mb\n")
                    if print_tensor_sizes is True:
                        # Tag each live CUDA tensor with the site where it
                        # was first observed, then diff against the last
                        # snapshot to log allocations (+) and frees (-).
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()),
                                             np.prod(np.array(x.size())) * 4 / 1024**2,
                                             x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, m, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50}{str(s):<20}{str(m)[:4]} M {str(t):<10}\n')
                        for t, s, m, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50}{str(s):<20}{str(m)[:4]} M {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                pynvml.nvmlShutdown()

            # Save details about the line _to be_ executed, for the next call.
            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if filename.endswith(".pyc") or filename.endswith(".pyo"):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            return gpu_profile

        except Exception as e:
            print('An exception occurred: {}'.format(e))

    return gpu_profile
def get_tensors():
    """Yield every live CUDA tensor currently tracked by the garbage
    collector.

    Walks all objects known to ``gc``; non-tensors are skipped, and
    per-object errors (e.g. objects that raise on attribute access) are
    printed and ignored so one bad object cannot abort the scan.

    Yields:
        torch.Tensor: each tensor whose ``is_cuda`` flag is set.
    """
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                tensor = obj
            else:
                continue
            if tensor.is_cuda:
                yield tensor
        except Exception as e:
            print('An exception occurred: {}'.format(e))