中国建设网站下载个人网站特点-万宁市网站建设公司-Seo优化

中国建设网站下载,个人网站特点,沈阳做网站企业,搜索词热度查询

# HDF5完整文件结构与操作指南

## 目录

1. 完整文件结构概览
2. 基础数据集类型
3. 组结构操作
4. 属性系统
5. 高级数据类型
6. 引用和链接
7. 压缩和分块
8. 可扩展数据集
9. 维度标签
10. 完整示例代码

## 1. 完整文件结构概览

### 1.1 理想的HDF5文件结构

```
comprehensive_example.h5                    # 根文件
│
├── metadata/                               # 元数据组
│   ├── attrs: {title, author, version}     # 组属性
│   ├── description (string)                # 描述文本
│   ├── creation_date (datetime)            # 创建日期
│   └── parameters (structured)             # 参数结构体
│
├── raw_data/                               # 原始数据组
│   ├── sensor_1 (1D float32)               # 1维数据
│   ├── sensor_2 (1D float32)
│   ├── images (3D uint8)                   # 3维图像数据
│   │   ├── attrs: {resolution, units}      # 数据集属性
│   │   └── dims: [time, height, width]     # 维度标签
│   └── measurements (2D float64)           # 2维测量数据
│
├── processed_data/                         # 处理后数据组
│   ├── filtered (2D compressed)            # 压缩数据集
│   ├── normalized (2D chunked)             # 分块数据集
│   └── statistics (compound type)          # 复合数据类型
│
├── models/                                 # 模型组
│   ├── neural_network/                     # 神经网络子组
│   │   ├── layer_1_weights (2D)
│   │   ├── layer_1_biases (1D)
│   │   ├── layer_2_weights (2D)
│   │   └── layer_2_biases (1D)
│   └── config/
│       └── hyperparameters (JSON)
│
├── time_series/                            # 时间序列数据
│   ├── data (resizable 2D)                 # 可扩展数据集
│   ├── timestamps (1D datetime)
│   └── labels (1D categorical)
│
├── references/                             # 引用和链接
│   ├── link_to_raw -> /raw_data            # 软链接
│   ├── external_link -> file.h5:/data      # 外部链接
│   └── object_references (refs)            # 对象引用
│
├── special_types/                          # 特殊数据类型
│   ├── string_array (variable length)      # 变长字符串
│   ├── boolean_mask (bool)                 # 布尔类型
│   ├── enum_data (enum)                    # 枚举类型
│   ├── complex_numbers (complex)           # 复数
│   └── nested_compound (nested)            # 嵌套结构体
│
└── large_data/                             # 大数据集
    ├── chunked_compressed (gzip)           # 分块压缩
    ├── lzf_compressed (lzf)                # LZF压缩
    └── virtual_dataset (virtual)           # 虚拟数据集
```

2. 
基础数据集类型2.1 数值类型数据集2.1.1 整数类型importh5pyimportnumpyasnpwithh5py.File(example.h5,w)asf:# 有符号整数f.create_dataset(int8_data,datanp.array([1,2,3],dtypenp.int8))f.create_dataset(int16_data,datanp.array([100,200],dtypenp.int16))f.create_dataset(int32_data,datanp.array([1000,2000],dtypenp.int32))f.create_dataset(int64_data,datanp.array([10000,20000],dtypenp.int64))# 无符号整数f.create_dataset(uint8_data,datanp.array([255,128],dtypenp.uint8))f.create_dataset(uint16_data,datanp.array([65535],dtypenp.uint16))f.create_dataset(uint32_data,datanp.array([4294967295],dtypenp.uint32))f.create_dataset(uint64_data,datanp.array([2**63],dtypenp.uint64))# 读取操作withh5py.File(example.h5,r)asf:int8_dataf[int8_data][:]print(f数据类型:{int8_data.dtype})print(f数据:{int8_data})2.1.2 浮点类型withh5py.File(example.h5,w)asf:# 单精度浮点f.create_dataset(float32_data,datanp.array([3.14,2.71],dtypenp.float32))# 双精度浮点f.create_dataset(float64_data,datanp.array([3.141592653589793],dtypenp.float64))# 半精度浮点节省空间f.create_dataset(float16_data,datanp.array([1.5,2.5],dtypenp.float16))# 读取并查看精度withh5py.File(example.h5,r)asf:fornamein[float32_data,float64_data,float16_data]:dataf[name][:]print(f{name}: dtype{data.dtype}, precision{data.itemsize*8}bits)2.1.3 复数类型withh5py.File(example.h5,w)asf:# 复数类型complex_datanp.array([12j,34j,56j],dtypenp.complex64)f.create_dataset(complex64,datacomplex_data)# 双精度复数complex_data_highnp.array([12j,34j],dtypenp.complex128)f.create_dataset(complex128,datacomplex_data_high)# 读取和处理复数withh5py.File(example.h5,r)asf:c_dataf[complex64][:]print(f实部:{c_data.real})print(f虚部:{c_data.imag})print(f模:{np.abs(c_data)})print(f相位:{np.angle(c_data)})2.1.4 布尔类型withh5py.File(example.h5,w)asf:# 布尔数组bool_datanp.array([True,False,True,True],dtypebool)f.create_dataset(boolean_mask,databool_data)# 布尔矩阵用于掩码bool_matrixnp.random.rand(100,100)0.5f.create_dataset(random_mask,databool_matrix)# 读取和应用掩码withh5py.File(example.h5,r)asf:maskf[random_mask][:]# 
可以用于过滤数据datanp.random.randn(100,100)filtered_datadata[mask]print(f掩码选中了{mask.sum()}个元素)2.2 字符串类型数据集2.2.1 固定长度字符串withh5py.File(example.h5,w)asf:# 固定长度ASCII字符串fixed_stringsnp.array([bhello,bworld,btest],dtypeS10)f.create_dataset(fixed_ascii,datafixed_strings)# 固定长度Unicode字符串fixed_unicodenp.array([你好,世界,测试],dtypeU10)f.create_dataset(fixed_unicode,datafixed_unicode)# 读取字符串withh5py.File(example.h5,r)asf:ascii_dataf[fixed_ascii][:]unicode_dataf[fixed_unicode][:]print(fASCII:{ascii_data})print(fUnicode:{unicode_data})2.2.2 变长字符串withh5py.File(example.h5,w)asf:# 变长ASCII字符串dt_asciih5py.string_dtype(encodingascii)var_strings[short,a very long string,medium]f.create_dataset(variable_ascii,datavar_strings,dtypedt_ascii)# 变长UTF-8字符串dt_utf8h5py.string_dtype(encodingutf-8)var_unicode[短,这是一个很长的中文字符串,中等长度]f.create_dataset(variable_utf8,datavar_unicode,dtypedt_utf8)# 读取变长字符串withh5py.File(example.h5,r)asf:var_asciif[variable_ascii][:]var_utf8f[variable_utf8][:]print(f变长ASCII:{var_ascii})print(f变长UTF-8:{var_utf8})# 单个元素访问print(f第一个元素:{f[variable_utf8][0]})2.3 多维数组2.3.1 一维数组向量withh5py.File(example.h5,w)asf:# 时间序列数据time_seriesnp.sin(np.linspace(0,10*np.pi,1000))dsetf.create_dataset(time_series,datatime_series)dset.attrs[description]Sine wavedset.attrs[sampling_rate]100# Hz# 读取和处理withh5py.File(example.h5,r)asf:tsf[time_series]print(f形状:{ts.shape})print(f长度:{len(ts)})print(f采样率:{ts.attrs[sampling_rate]}Hz)# 切片读取first_100ts[:100]last_100ts[-100:]2.3.2 二维数组矩阵withh5py.File(example.h5,w)asf:# 图像数据灰度图imagenp.random.randint(0,256,(512,512),dtypenp.uint8)dsetf.create_dataset(grayscale_image,dataimage)dset.attrs[height]512dset.attrs[width]512dset.attrs[channels]1# 表格数据table_datanp.random.randn(1000,50)# 1000行50列f.create_dataset(table_data,datatable_data)# 读取操作withh5py.File(example.h5,r)asf:# 读取整个图像imgf[grayscale_image][:]# 读取图像的一部分ROIroif[grayscale_image][100:200,100:200]# 读取表格的特定行列col_5f[table_data][:,5]# 第5列row_10f[table_data][10,:]# 第10行subsetf[table_data][0:100,0:10]# 子集2.3.3 
三维数组体数据withh5py.File(example.h5,w)asf:# RGB图像序列videonp.random.randint(0,256,(100,480,640,3),dtypenp.uint8)dsetf.create_dataset(video_rgb,datavideo)dset.attrs[num_frames]100dset.attrs[height]480dset.attrs[width]640dset.attrs[channels]3dset.attrs[fps]30# 3D医学图像CT扫描ct_scannp.random.randn(256,256,128)# [x, y, z]f.create_dataset(ct_scan,datact_scan)# 读取操作withh5py.File(example.h5,r)asf:# 读取特定帧frame_10f[video_rgb][10,:,:,:]# 读取时间切片time_slicef[video_rgb][0:50:5,:,:,:]# 每5帧取一帧# 读取空间切片spatial_slicef[video_rgb][:,100:200,200:300,:]2.3.4 四维及更高维数组withh5py.File(example.h5,w)asf:# 4D: [batch, height, width, channels]batch_imagesnp.random.randn(32,224,224,3)f.create_dataset(image_batch,databatch_images)# 5D: [time, batch, height, width, channels]video_batchnp.random.randn(10,16,64,64,3)f.create_dataset(video_batch,datavideo_batch)# 读取操作withh5py.File(example.h5,r)asf:# 读取特定批次batch_0f[image_batch][0,:,:,:]# 复杂切片subsetf[video_batch][0:5,0:8,::2,::2,:]# 降采样3. 组结构操作3.1 创建和组织组withh5py.File(example.h5,w)asf:# 方法1: 直接创建组group1f.create_group(level1)# 方法2: 创建嵌套组group2f.create_group(level1/level2)# 方法3: 使用require_group存在则返回不存在则创建group3f.require_group(level1/level2/level3)# 在组中创建数据集group1.create_dataset(data1,datanp.arange(10))group2.create_dataset(data2,datanp.arange(20))group3.create_dataset(data3,datanp.arange(30))# 创建多个平行组foriinrange(5):groupf.create_group(fexperiment_{i})group.create_dataset(results,datanp.random.randn(100))group.attrs[experiment_id]i group.attrs[timestamp]f2024-01-{i1:02d}3.2 遍历组结构defprint_structure(name,obj):递归打印HDF5结构indent *name.count(/)ifisinstance(obj,h5py.Group):print(f{indent}{name}/)elifisinstance(obj,h5py.Dataset):print(f{indent}{name}{obj.shape}{obj.dtype})withh5py.File(example.h5,r)asf:print(方法1: visititems)f.visititems(print_structure)print(\n方法2: 手动遍历)defrecursive_print(group,level0):forkeyingroup.keys():itemgroup[key]indent 
*levelifisinstance(item,h5py.Group):print(f{indent}{key}/)recursive_print(item,level1)else:print(f{indent}{key}{item.shape})recursive_print(f)print(\n方法3: 只遍历特定组)forkeyinf[level1].keys():print(fFound:{key})3.3 移动、复制和删除组withh5py.File(example.h5,a)asf:# 复制组f.copy(level1,level1_copy)# 移动组重命名f.move(level1_copy,level1_backup)# 删除组delf[level1_backup]# 复制到另一个文件withh5py.File(destination.h5,w)asf_dest:f.copy(level1,f_dest,nameimported_data)3.4 按条件查找数据集deffind_datasets(group,condition):查找满足条件的数据集results[]defsearch(name,obj):ifisinstance(obj,h5py.Dataset):ifcondition(name,obj):results.append(name)group.visititems(search)returnresultswithh5py.File(example.h5,r)asf:# 查找所有浮点数据集float_datasetsfind_datasets(f,lambdaname,obj:obj.dtype.kindf)# 查找大于特定大小的数据集large_datasetsfind_datasets(f,lambdaname,obj:obj.size1000)# 查找包含特定属性的数据集with_attrfind_datasets(f,lambdaname,obj:experiment_idinobj.attrs)print(f浮点数据集:{float_datasets})print(f大数据集:{large_datasets})print(f有experiment_id属性:{with_attr})4. 属性系统4.1 文件级属性withh5py.File(example.h5,w)asf:# 字符串属性f.attrs[title]My Research Dataf.attrs[author]Josh Wittf.attrs[institution]University# 数值属性f.attrs[version]1.0f.attrs[year]2024# 数组属性f.attrs[dimensions][1024,768]f.attrs[channels][0,1,2]# 日期时间存储为字符串fromdatetimeimportdatetime f.attrs[created]datetime.now().isoformat()# 布尔属性f.attrs[is_validated]Truef.attrs[is_published]False# 读取文件属性withh5py.File(example.h5,r)asf:print(文件属性:)forkey,valueinf.attrs.items():print(f{key}:{value})4.2 组级属性withh5py.File(example.h5,w)asf:# 为不同的实验组添加元数据forexp_idinrange(3):groupf.create_group(fexperiment_{exp_id})# 实验参数group.attrs[temperature]20exp_id*5# ℃group.attrs[pressure]1.0exp_id*0.1# atmgroup.attrs[duration]3600# seconds# 实验状态group.attrs[status]completedgroup.attrs[quality_score]0.95# 参考信息group.attrs[reference_paper]Smith et al., 2024group.attrs[doi]f10.1234/journal.{exp_id}# 读取和筛选withh5py.File(example.h5,r)asf:# 
找出所有高温实验high_temp_exps[]forkeyinf.keys():iff[key].attrs.get(temperature,0)25:high_temp_exps.append(key)print(f高温实验:{high_temp_exps})4.3 数据集级属性withh5py.File(example.h5,w)asf:# 创建数据集并添加详细元数据datanp.random.randn(1000,100)dsetf.create_dataset(measurements,datadata)# 物理单位和量程dset.attrs[units]meters per seconddset.attrs[range][data.min(),data.max()]dset.attrs[mean]data.mean()dset.attrs[std]data.std()# 采集信息dset.attrs[sampling_rate]1000# Hzdset.attrs[num_channels]100dset.attrs[calibration_factor]1.05# 质量控制dset.attrs[outliers_removed]5dset.attrs[missing_values]0dset.attrs[validated]True# 处理历史dset.attrs[preprocessing]bandpass filter 0.1-100 Hzdset.attrs[detrended]True# 使用属性进行数据处理withh5py.File(example.h5,r)asf:dsetf[measurements]# 根据属性校准数据calibrationdset.attrs[calibration_factor]datadset[:]*calibration# 显示统计信息print(f单位:{dset.attrs[units]})print(f均值:{dset.attrs[mean]:.2f})print(f标准差:{dset.attrs[std]:.2f})4.4 修改和删除属性withh5py.File(example.h5,a)asf:dsetf[measurements]# 修改现有属性dset.attrs[version]2.0# 添加新属性dset.attrs[last_modified]datetime.now().isoformat()# 删除属性iftemporaryindset.attrs:deldset.attrs[temporary]# 批量更新属性new_attrs{processed:True,algorithm:FFT,window:Hamming}dset.attrs.update(new_attrs)5. 
高级数据类型5.1 复合数据类型结构体# 定义复合数据类型dtnp.dtype([(name,S50),# 固定长度字符串(age,i4),# 32位整数(height,f4),# 32位浮点(weight,f4),(is_active,?),# 布尔(scores,f4,(3,))# 固定长度数组])withh5py.File(example.h5,w)asf:# 创建结构化数据集datanp.array([(bAlice,25,165.5,55.2,True,[90,85,88]),(bBob,30,175.0,70.5,True,[78,82,80]),(bCharlie,28,180.2,75.0,False,[95,92,89])],dtypedt)dsetf.create_dataset(people,datadata)dset.attrs[description]Personnel records# 读取和访问withh5py.File(example.h5,r)asf:dataf[people][:]# 访问特定字段namesdata[name]agesdata[age]scoresdata[scores]# 访问特定记录first_persondata[0]print(f第一个人:{first_person[name]}, 年龄:{first_person[age]})# 筛选数据active_peopledata[data[is_active]]adultsdata[data[age]18]5.2 嵌套复合类型# 定义嵌套结构address_dtypenp.dtype([(street,S100),(city,S50),(zipcode,i4)])person_dtypenp.dtype([(id,i4),(name,S50),(address,address_dtype),# 嵌套结构(salary,f8)])withh5py.File(example.h5,w)asf:datanp.array([(1,bAlice,(b123 Main St,bBoston,12345),75000.0),(2,bBob,(b456 Oak Ave,bNYC,10001),85000.0)],dtypeperson_dtype)f.create_dataset(employees,datadata)# 读取嵌套数据withh5py.File(example.h5,r)asf:dataf[employees][:]# 访问嵌套字段citiesdata[address][city]zipcodesdata[address][zipcode]print(f员工城市:{cities})print(f邮编:{zipcodes})5.3 枚举类型# 创建枚举类型status_enumh5py.enum_dtype({PENDING:0,RUNNING:1,COMPLETED:2,FAILED:3},basetypei)withh5py.File(example.h5,w)asf:# 使用枚举类型statusesnp.array([0,1,2,1,2,3],dtypestatus_enum)dsetf.create_dataset(task_status,datastatuses)# 读取枚举withh5py.File(example.h5,r)asf:statusesf[task_status][:]# 统计各状态数量unique,countsnp.unique(statuses,return_countsTrue)forval,countinzip(unique,counts):print(f状态{val}:{count}个)5.4 变长数据类型withh5py.File(example.h5,w)asf:# 变长整数数组vlen_inth5py.vlen_dtype(np.dtype(int32))datanp.array([[1,2,3],[4,5],[6,7,8,9,10]],dtypeobject)f.create_dataset(variable_length_arrays,datadata,dtypevlen_int)# 变长浮点数组vlen_floath5py.vlen_dtype(np.dtype(float64))ragged_datanp.array([[1.1,2.2],[3.3,4.4,5.5,6.6],[7.7]],dtypeobject)f.create_dataset(ragged_arrays,dataragged_data,dtypevlen_float)# 
读取变长数据withh5py.File(example.h5,r)asf:vlen_dataf[variable_length_arrays][:]fori,arrinenumerate(vlen_data):print(f行{i}: 长度{len(arr)}, 数据{arr})6. 引用和链接6.1 软链接Soft Linkswithh5py.File(example.h5,w)asf:# 创建原始数据datanp.arange(100)f.create_dataset(data/original,datadata)# 创建软链接f[link_to_original]h5py.SoftLink(/data/original)# 在其他组中创建链接f.create_group(analysis)f[analysis/data_link]h5py.SoftLink(/data/original)# 使用软链接withh5py.File(example.h5,r)asf:# 通过链接访问数据data_via_linkf[link_to_original][:]data_originalf[data/original][:]# 验证是否指向同一数据print(f数据相同:{np.array_equal(data_via_link,data_original)})6.2 硬链接Hard Linkswithh5py.File(example.h5,w)asf:# 创建数据集datanp.random.randn(100)dsetf.create_dataset(original_data,datadata)# 创建硬链接多个名称指向同一对象f[copy1]dset# 硬链接f[copy2]dset# 另一个硬链接# 删除原始名称数据仍然存在delf[original_data]# 通过其他名称仍可访问data_via_copyf[copy1][:]6.3 外部链接External Links# 创建源文件withh5py.File(source.h5,w)asf:f.create_dataset(external_data,datanp.arange(1000))# 创建带外部链接的文件withh5py.File(main.h5,w)asf:# 链接到另一个文件的数据集f[linked_data]h5py.ExternalLink(source.h5,/external_data)# 链接到另一个文件的组f[linked_group]h5py.ExternalLink(source.h5,/)# 使用外部链接withh5py.File(main.h5,r)asf:# 自动访问外部文件的数据dataf[linked_data][:]print(f从外部文件读取的数据:{data[:10]})6.4 对象引用Object Referenceswithh5py.File(example.h5,w)asf:# 创建多个数据集dset1f.create_dataset(dataset_1,datanp.arange(10))dset2f.create_dataset(dataset_2,datanp.arange(20))dset3f.create_dataset(dataset_3,datanp.arange(30))# 创建对象引用数组ref_dtypeh5py.ref_dtype refsnp.array([dset1.ref,dset2.ref,dset3.ref],dtyperef_dtype)f.create_dataset(dataset_references,datarefs)# 使用对象引用withh5py.File(example.h5,r)asf:refsf[dataset_references][:]# 通过引用访问对象fori,refinenumerate(refs):dsetf[ref]print(f引用{i}指向:{dset.name}, 形状:{dset.shape})datadset[:]print(f 数据:{data})6.5 区域引用Region Referenceswithh5py.File(example.h5,w)asf:# 创建一个大数据集datanp.arange(1000).reshape(100,10)dsetf.create_dataset(large_dataset,datadata)# 创建区域引用# 引用特定的行region1dset.regionref[0:10,:]# 前10行region2dset.regionref[50:60,:]# 中间10行# 
引用特定的列region3dset.regionref[:,0:5]# 前5列# 引用特定的矩形区域region4dset.regionref[20:30,3:7]# 子矩阵# 存储区域引用ref_dtypeh5py.regionref_dtype regionsnp.array([region1,region2,region3,region4],dtyperef_dtype)f.create_dataset(regions,dataregions)# 使用区域引用withh5py.File(example.h5,r)asf:dsetf[large_dataset]regionsf[regions][:]fori,regioninenumerate(regions):# 通过区域引用读取数据region_datadset[region]print(f区域{i}: 形状 {region_data.shape})print(f 数据样本:{region_data.ravel()[:5]})7. 压缩和分块7.1 压缩方法对比importtime# 创建测试数据test_datanp.random.randn(10000,1000).astype(float32)withh5py.File(compression_test.h5,w)asf:# 无压缩starttime.time()f.create_dataset(no_compression,datatest_data)time_no_comptime.time()-start# GZIP压缩级别1-9forlevelin[1,4,9]:starttime.time()f.create_dataset(fgzip_level_{level},datatest_data,compressiongzip,compression_optslevel)time_gziptime.time()-startprint(fGZIP级别{level}:{time_gzip:.2f}秒)# LZF压缩starttime.time()f.create_dataset(lzf_compression,datatest_data,compressionlzf)time_lzftime.time()-startprint(fLZF:{time_lzf:.2f}秒)# SZIP压缩需要特殊编译的HDF5try:f.create_dataset(szip_compression,datatest_data,compressionszip,compression_opts(nn,16))except:print(SZIP不可用)# 比较文件大小和读取速度importoswithh5py.File(compression_test.h5,r)asf:fornameinf.keys():dsetf[name]# 读取速度测试starttime.time()_dset[:]read_timetime.time()-start# 获取存储大小storage_sizedset.id.get_storage_size()print(f{name}:)print(f 存储大小:{storage_size/1024/1024:.2f}MB)print(f 读取时间:{read_time:.3f}秒)7.2 分块策略withh5py.File(chunking_test.h5,w)asf:datanp.random.randn(10000,10000).astype(float32)# 自动分块f.create_dataset(auto_chunks,datadata,chunksTrue)# 按行分块适合行遍历f.create_dataset(row_chunks,datadata,chunks(100,10000))# 100行一块# 按列分块适合列遍历f.create_dataset(col_chunks,datadata,chunks(10000,100))# 100列一块# 方块分块适合随机访问f.create_dataset(square_chunks,datadata,chunks(1000,1000))# 1000x1000的块# 小块分块f.create_dataset(small_chunks,datadata,chunks(10,10))# 测试不同访问模式的性能withh5py.File(chunking_test.h5,r)asf:fornamein[row_chunks,col_chunks,square_chunks]:dsetf[name]# 
行访问测试starttime.time()foriinrange(0,10000,1000):_dset[i,:]row_timetime.time()-start# 列访问测试starttime.time()forjinrange(0,10000,1000):_dset[:,j]col_timetime.time()-startprint(f{name}:)print(f 行访问:{row_time:.2f}秒)print(f 列访问:{col_time:.2f}秒)7.3 最优分块大小计算defcalculate_optimal_chunk_size(shape,dtype,target_chunk_size_mb1): 计算最优分块大小参数: shape: 数据集形状 dtype: 数据类型 target_chunk_size_mb: 目标块大小MB element_sizenp.dtype(dtype).itemsize target_elements(target_chunk_size_mb*1024*1024)/element_size# 尝试保持原始形状的比例ndimlen(shape)chunk_shapelist(shape)total_elementsnp.prod(shape)iftotal_elementstarget_elements:returntuple(shape)# 缩小各维度scale(target_elements/total_elements)**(1/ndim)chunk_shape[max(1,int(dim*scale))fordiminshape]returntuple(chunk_shape)# 使用示例shape(10000,5000,3)dtypenp.float32 optimal_chunkscalculate_optimal_chunk_size(shape,dtype)print(f推荐的块大小:{optimal_chunks})withh5py.File(optimal_chunks.h5,w)asf:datanp.random.randn(*shape).astype(dtype)f.create_dataset(data,datadata,chunksoptimal_chunks,compressiongzip,compression_opts4)7.4 Shuffle过滤器withh5py.File(shuffle_test.h5,w)asf:# 创建具有相关性的数据更容易压缩datanp.arange(100000,dtypefloat32).reshape(1000,100)datanp.random.randn(1000,100)*0.1# 不使用shufflef.create_dataset(without_shuffle,datadata,compressiongzip,compression_opts9,shuffleFalse)# 使用shuffle通常能提高压缩率f.create_dataset(with_shuffle,datadata,compressiongzip,compression_opts9,shuffleTrue)# 比较压缩效果withh5py.File(shuffle_test.h5,r)asf:size_withoutf[without_shuffle].id.get_storage_size()size_withf[with_shuffle].id.get_storage_size()print(f不使用shuffle:{size_without/1024:.2f}KB)print(f使用shuffle:{size_with/1024:.2f}KB)print(f压缩率提升:{(1-size_with/size_without)*100:.1f}%)8. 
可扩展数据集8.1 一维可扩展数据集withh5py.File(resizable.h5,w)asf:# 创建可扩展数据集dsetf.create_dataset(expandable_1d,shape(100,),maxshape(None,),# 可无限扩展dtypefloat32,chunks(100,))# 初始数据dset[:]np.random.randn(100)# 追加数据withh5py.File(resizable.h5,a)asf:dsetf[expandable_1d]# 扩展数据集old_sizedset.shape[0]new_datanp.random.randn(50)dset.resize(old_size50,axis0)dset[old_size:]new_dataprint(f新大小:{dset.shape})8.2 多维可扩展数据集withh5py.File(resizable.h5,w)asf:# 创建2D可扩展数据集dsetf.create_dataset(expandable_2d,shape(100,50),maxshape(None,50),# 只在第一维可扩展dtypefloat32,chunks(10,50))dset[:]np.random.randn(100,50)# 追加行withh5py.File(resizable.h5,a)asf:dsetf[expandable_2d]old_rowsdset.shape[0]new_rows20dset.resize(old_rowsnew_rows,axis0)dset[old_rows:,:]np.random.randn(new_rows,50)8.3 流式数据写入defstream_data_writer(filename,chunk_size1000): 模拟流式数据写入 withh5py.File(filename,w)asf:# 创建可扩展数据集dsetf.create_dataset(streaming_data,shape(0,100),maxshape(None,100),chunks(chunk_size,100),dtypefloat32)# 模拟连续数据流foriinrange(10):# 10批数据# 生成新数据new_datanp.random.randn(chunk_size,100)# 扩展并写入old_sizedset.shape[0]dset.resize(old_sizechunk_size,axis0)dset[old_size:,:]new_dataprint(f批次{i1}: 累计大小 {dset.shape})# 使用stream_data_writer(streaming.h5)8.4 时间序列数据追加fromdatetimeimportdatetime,timedeltawithh5py.File(timeseries.h5,w)asf:# 创建时间戳数据集dt_typeh5py.string_dtype(encodingutf-8)timestampsf.create_dataset(timestamps,shape(0,),maxshape(None,),dtypedt_type,chunks(1000,))# 创建数值数据集valuesf.create_dataset(values,shape(0,10),maxshape(None,10),chunks(1000,10),dtypefloat32)# 初始化start_timedatetime.now()foriinrange(5):# 生成新时间戳current_timestart_timetimedelta(secondsi)timestamp_strcurrent_time.isoformat()# 生成新数据new_valuenp.random.randn(1,10)# 追加old_sizevalues.shape[0]timestamps.resize(old_size1,axis0)values.resize(old_size1,axis0)timestamps[old_size]timestamp_str values[old_size,:]new_value# 读取时间序列withh5py.File(timeseries.h5,r)asf:tsf[timestamps][:]valsf[values][:]fort,vinzip(ts[:5],vals[:5]):print(f{t}:{v})9. 
维度标签9.1 创建维度标签withh5py.File(dimensions.h5,w)asf:# 创建数据集datanp.random.randn(100,64,64,3)dsetf.create_dataset(video,datadata)# 创建维度标签数据集# 维度0: 时间time_scalef.create_dataset(time,datanp.arange(100))time_scale.attrs[units]frames# 维度1和2: 空间坐标y_coordsf.create_dataset(y_coords,datanp.arange(64))x_coordsf.create_dataset(x_coords,datanp.arange(64))# 维度3: 颜色通道channelsf.create_dataset(channels,data[bR,bG,bB])# 附加维度标签dset.dims[0].labeltimedset.dims[1].labelydset.dims[2].labelxdset.dims[3].labelchannel# 附加维度标度dimension scalesdset.dims[0].attach_scale(time_scale)dset.dims[1].attach_scale(y_coords)dset.dims[2].attach_scale(x_coords)dset.dims[3].attach_scale(channels)# 读取维度信息withh5py.File(dimensions.h5,r)asf:dsetf[video]print(维度信息:)fori,diminenumerate(dset.dims):print(f 维度{i}:{dim.label})# 获取维度标度iflen(dim)0:scaledim[0]print(f 标度:{scale.name})print(f 值:{scale[:5]}...)# 显示前5个9.2 多个维度标度withh5py.File(multi_scale.h5,w)asf:# 创建数据集datanp.random.randn(1000,100)dsetf.create_dataset(measurements,datadata)# 为第一维创建多个标度# 标度1: 采样点索引indicesf.create_dataset(sample_indices,datanp.arange(1000))# 标度2: 时间秒time_secondsf.create_dataset(time_seconds,datanp.arange(1000)*0.001)# 标度3: 时间戳dt_typeh5py.string_dtype(encodingutf-8)timestamps[]start_timedatetime(2024,1,1,0,0,0)foriinrange(1000):tsstart_timetimedelta(millisecondsi)timestamps.append(ts.isoformat())f.create_dataset(timestamps,datatimestamps,dtypedt_type)# 附加所有标度dset.dims[0].attach_scale(indices)dset.dims[0].attach_scale(time_seconds)dset.dims[0].attach_scale(f[timestamps])# 为第二维创建标度channel_names[fChannel_{i}.encode()foriinrange(100)]channelsf.create_dataset(channel_names,datachannel_names)dset.dims[1].attach_scale(channels)# 使用维度标度withh5py.File(multi_scale.h5,r)asf:dsetf[measurements]print(第一维的标度:)forscaleindset.dims[0]:print(f{scale.name}:{scale[:3]}...)10. 完整示例代码10.1 创建综合示例文件创建一个包含所有HDF5特性的综合示例文件 importh5pyimportnumpyasnpfromdatetimeimportdatetimedefcreate_comprehensive_h5(filenamecomprehensive.h5):withh5py.File(filename,w)asf:# # 1. 
文件级元数据# f.attrs[title]Comprehensive HDF5 Examplef.attrs[author]Josh Wittf.attrs[created]datetime.now().isoformat()f.attrs[version]1.0f.attrs[description]Contains all HDF5 data types and features# # 2. 基础数值数据# basic_groupf.create_group(basic_types)# 各种数值类型basic_group.create_dataset(int32,datanp.arange(100,dtypei4))basic_group.create_dataset(float64,datanp.random.randn(100))basic_group.create_dataset(complex128,datanp.random.randn(50)1j*np.random.randn(50))basic_group.create_dataset(bool,datanp.random.rand(100)0.5)# 多维数组basic_group.create_dataset(matrix_2d,datanp.random.randn(100,50))basic_group.create_dataset(tensor_3d,datanp.random.randn(10,20,30))basic_group.create_dataset(tensor_4d,datanp.random.randn(5,10,20,3))# # 3. 字符串数据# string_groupf.create_group(strings)# 固定长度string_group.create_dataset(fixed_ascii,datanp.array([bhello,bworld],dtypeS10))# 变长字符串vlen_strh5py.string_dtype(encodingutf-8)string_group.create_dataset(variable_utf8,data[短,这是一个很长的字符串,中],dtypevlen_str)# # 4. 复合数据类型# compound_groupf.create_group(compound_types)# 简单结构体person_dtnp.dtype([(name,S50),(age,i4),(salary,f8)])person_datanp.array([(bAlice,25,75000.0),(bBob,30,85000.0),(bCharlie,28,80000.0)],dtypeperson_dt)compound_group.create_dataset(people,dataperson_data)# 嵌套结构体nested_dtnp.dtype([(id,i4),(measurements,f4,(5,)),# 固定长度数组(valid,?)])nested_datanp.array([(1,[1.1,2.2,3.3,4.4,5.5],True),(2,[6.6,7.7,8.8,9.9,10.0],False)],dtypenested_dt)compound_group.create_dataset(nested,datanested_data)# # 5. 压缩数据# compression_groupf.create_group(compressed)test_datanp.random.randn(1000,1000).astype(float32)# 不同压缩方法compression_group.create_dataset(gzip_level_1,datatest_data,compressiongzip,compression_opts1)compression_group.create_dataset(gzip_level_9,datatest_data,compressiongzip,compression_opts9,shuffleTrue)compression_group.create_dataset(lzf,datatest_data,compressionlzf)# # 6. 
可扩展数据集# expandable_groupf.create_group(expandable)# 1D可扩展exp_1dexpandable_group.create_dataset(data_1d,shape(100,),maxshape(None,),chunks(100,),dtypefloat32)exp_1d[:]np.random.randn(100)# 2D可扩展exp_2dexpandable_group.create_dataset(data_2d,shape(100,50),maxshape(None,50),chunks(100,50),dtypefloat32)exp_2d[:]np.random.randn(100,50)# # 7. 引用和链接# reference_groupf.create_group(references)# 创建被引用的数据target_datanp.arange(100)targetf.create_dataset(target_dataset,datatarget_data)# 软链接reference_group[soft_link]h5py.SoftLink(/target_dataset)# 对象引用reftarget.ref reference_group.create_dataset(object_ref,dataref)# # 8. 属性示例# attr_groupf.create_group(attributes_example)# 数据集with丰富的属性sensor_datanp.random.randn(1000,10)sensor_dsetattr_group.create_dataset(sensor_readings,datasensor_data)# 各种类型的属性sensor_dset.attrs[units]meters/secondsensor_dset.attrs[sampling_rate]1000.0sensor_dset.attrs[calibrated]Truesensor_dset.attrs[sensor_ids][1,2,3,4,5,6,7,8,9,10]sensor_dset.attrs[date_collected]datetime.now().isoformat()sensor_dset.attrs[location]Lab Building A, Room 101sensor_dset.attrs[temperature]23.5sensor_dset.attrs[humidity]45.2# # 9. 维度标签# dims_groupf.create_group(with_dimensions)# 3D数据with维度标签volume_datanp.random.randn(50,100,100)volumedims_group.create_dataset(volume,datavolume_data)# 创建维度标度z_coordsdims_group.create_dataset(z,datanp.arange(50))y_coordsdims_group.create_dataset(y,datanp.arange(100))x_coordsdims_group.create_dataset(x,datanp.arange(100))# 附加维度volume.dims[0].labelzvolume.dims[1].labelyvolume.dims[2].labelxvolume.dims[0].attach_scale(z_coords)volume.dims[1].attach_scale(y_coords)volume.dims[2].attach_scale(x_coords)# # 10. 
实际应用示例神经网络权重# nn_groupf.create_group(neural_network)# 模拟神经网络层layers{layer1:{weights:np.random.randn(784,128),biases:np.zeros(128)},layer2:{weights:np.random.randn(128,64),biases:np.zeros(64)},layer3:{weights:np.random.randn(64,10),biases:np.zeros(10)}}forlayer_name,paramsinlayers.items():layer_groupnn_group.create_group(layer_name)forparam_name,param_valueinparams.items():dsetlayer_group.create_dataset(param_name,dataparam_value)dset.attrs[trainable]Truedset.attrs[dtype]str(param_value.dtype)nn_group.attrs[architecture]feedforwardnn_group.attrs[input_size]784nn_group.attrs[output_size]10# # 11. 时间序列数据# ts_groupf.create_group(time_series)# 生成时间戳num_samples1000timestamps[]start_timedatetime(2024,1,1,0,0,0)foriinrange(num_samples):tsstart_timetimedelta(secondsi)timestamps.append(ts.isoformat())# 存储时间戳dt_typeh5py.string_dtype(encodingutf-8)ts_group.create_dataset(timestamps,datatimestamps,dtypedt_type)# 存储对应的数值ts_group.create_dataset(values,datanp.random.randn(num_samples,5))# # 12. 大数据集with优化# large_groupf.create_group(large_data)large_datanp.random.randn(10000,1000).astype(float32)large_dsetlarge_group.create_dataset(optimized,datalarge_data,chunks(1000,100),compressiongzip,compression_opts4,shuffleTrue)large_dset.attrs[chunk_strategy]optimized for row accesslarge_dset.attrs[compression_ratio]f{large_data.nbytes/large_dset.id.get_storage_size():.2f}xprint(f文件 {filename} 创建完成)# 创建文件create_comprehensive_h5()10.2 读取和分析综合示例文件读取并分析综合示例文件 defanalyze_h5_file(filenamecomprehensive.h5):print(f分析文件:{filename})print(*80)withh5py.File(filename,r)asf:# 显示文件属性print(\n【文件属性】)forkey,valueinf.attrs.items():print(f{key}:{value})# 递归显示结构print(\n【文件结构】)defprint_tree(name,obj,level0):indent *levelifisinstance(obj,h5py.Group):print(f{indent}{name}/)# 
显示组属性iflen(obj.attrs)0:forkeyinobj.attrs.keys():print(f{indent}{key}:{obj.attrs[key]})elifisinstance(obj,h5py.Dataset):size_mbobj.nbytes/(1024*1024)storage_mbobj.id.get_storage_size()/(1024*1024)compressionobj.compressionornoneprint(f{indent}{name})print(f{indent}形状:{obj.shape}, 类型:{obj.dtype})print(f{indent}大小:{size_mb:.2f}MB, 存储:{storage_mb:.2f}MB)print(f{indent}压缩:{compression})ifobj.chunks:print(f{indent}分块:{obj.chunks})# 显示数据集属性iflen(obj.attrs)0:print(f{indent}属性:)forkey,valueinobj.attrs.items():print(f{indent}{key}:{value})f.visititems(lambdan,o:print_tree(n,o,n.count(/)))# 统计信息print(\n【统计信息】)defcount_items(group):groups0datasets0total_size0defcount(name,obj):nonlocalgroups,datasets,total_sizeifisinstance(obj,h5py.Group):groups1elifisinstance(obj,h5py.Dataset):datasets1total_sizeobj.nbytes group.visititems(count)returngroups,datasets,total_size num_groups,num_datasets,total_sizecount_items(f)file_sizeos.path.getsize(filename)print(f 组数量:{num_groups})print(f 数据集数量:{num_datasets})print(f 原始数据大小:{total_size/(1024*1024):.2f}MB)print(f 文件大小:{file_size/(1024*1024):.2f}MB)print(f 总压缩率:{total_size/file_size:.2f}x)# 示例读取特定数据print(\n【示例数据读取】)# 读取基础类型ifbasic_types/float64inf:dataf[basic_types/float64][:10]print(f float64前10个值:{data})# 读取结构体ifcompound_types/peopleinf:peoplef[compound_types/people][:]print(f 人员记录:)forpersoninpeople:print(f{person[name].decode()}: 年龄{person[age]}, 工资${person[salary]})# 读取神经网络权重ifneural_networkinf:print(f 神经网络架构:{f[neural_network].attrs[architecture]})print(f 层:)forlayer_nameinf[neural_network].keys():layerf[neural_network][layer_name]weights_shapelayer[weights].shapeprint(f{layer_name}:{weights_shape})# 运行分析if__name____main__:create_comprehensive_h5()analyze_h5_file()10.3 实用工具函数集 HDF5实用工具函数集 
classHDF5Utils:HDF5工具类staticmethoddefget_file_info(filename):获取文件基本信息withh5py.File(filename,r)asf:info{filename:filename,file_size_mb:os.path.getsize(filename)/(1024*1024),num_groups:0,num_datasets:0,total_data_size_mb:0}defcount(name,obj):ifisinstance(obj,h5py.Group):info[num_groups]1elifisinstance(obj,h5py.Dataset):info[num_datasets]1info[total_data_size_mb]obj.nbytes/(1024*1024)f.visititems(count)# 文件属性info[attributes]dict(f.attrs)returninfostaticmethoddeffind_large_datasets(filename,threshold_mb10):查找大于阈值的数据集large_datasets[]withh5py.File(filename,r)asf:defcheck_size(name,obj):ifisinstance(obj,h5py.Dataset):size_mbobj.nbytes/(1024*1024)ifsize_mbthreshold_mb:large_datasets.append({name:name,size_mb:size_mb,shape:obj.shape,dtype:str(obj.dtype)})f.visititems(check_size)returnsorted(large_datasets,keylambdax:x[size_mb],reverseTrue)staticmethoddefcopy_dataset(src_file,src_path,dst_file,dst_pathNone):复制数据集到另一个文件ifdst_pathisNone:dst_pathsrc_pathwithh5py.File(src_file,r)asf_src:withh5py.File(dst_file,a)asf_dst:f_src.copy(src_path,f_dst,namedst_path)staticmethoddefexport_to_dict(filename,path/):将HDF5导出为嵌套字典result{}withh5py.File(filename,r)asf:defbuild_dict(name,obj):partsname.split(/)currentresultforpartinparts[:-1]:ifpartnotincurrent:current[part]{}currentcurrent[part]ifisinstance(obj,h5py.Dataset):current[parts[-1]]obj[:]elifisinstance(obj,h5py.Group):ifparts[-1]notincurrent:current[parts[-1]]{}f.visititems(build_dict)returnresultstaticmethoddefget_compression_stats(filename):获取压缩统计信息stats[]withh5py.File(filename,r)asf:defanalyze_compression(name,obj):ifisinstance(obj,h5py.Dataset):original_sizeobj.nbytes storage_sizeobj.id.get_storage_size()stats.append({name:name,compression:obj.compressionornone,original_mb:original_size/(1024*1024),storage_mb:storage_size/(1024*1024),ratio:original_size/storage_sizeifstorage_size0else1.0})f.visititems(analyze_compression)returnstatsstaticmethoddefvalidate_file(filename):验证HDF5文件完整性try:withh5py.File(filename,r)asf:# 
尝试访问所有数据集errors[]defvalidate_dataset(name,obj):ifisinstance(obj,h5py.Dataset):try:# 尝试读取第一个元素ifobj.size0:_obj.flat[0]exceptExceptionase:errors.append(f{name}:{str(e)})f.visititems(validate_dataset)iferrors:returnFalse,errorselse:returnTrue,[文件验证通过]exceptExceptionase:returnFalse,[f无法打开文件:{str(e)}]# 使用示例if__name____main__:utilsHDF5Utils()# 获取文件信息infoutils.get_file_info(comprehensive.h5)print(文件信息:,info)# 查找大数据集largeutils.find_large_datasets(comprehensive.h5,threshold_mb1)print(\n大数据集:,large)# 获取压缩统计comp_statsutils.get_compression_stats(comprehensive.h5)print(\n压缩统计:)forstatincomp_stats[:5]:# 只显示前5个print(f{stat[name]}:{stat[ratio]:.2f}x ({stat[compression]}))# 验证文件valid,messagesutils.validate_file(comprehensive.h5)print(f\n文件验证:{通过ifvalidelse失败})formsginmessages:print(f{msg})

## 总结

这份指南涵盖了HDF5的所有主要特性:

- 基础数据类型 - 数值、字符串、布尔等
- 组结构 - 层次化组织数据
- 属性系统 - 元数据管理
- 高级数据类型 - 复合类型、枚举、变长数据
- 引用和链接 - 软链接、硬链接、对象引用
- 压缩和分块 - 优化存储和访问
- 可扩展数据集 - 动态增长的数据
- 维度标签 - 为数据添加物理意义
- 完整示例 - 实际应用代码

中国建设网站下载个人网站特点

网站建设售后服务内容网站建设所需要的东西

建设企业网站需要哪些东西武昌网站建设

那个网站攻略做的好大连网站建设lccm

福州品牌网站设计w78企业网站后台怎么做

福州网站开发风格孝感市门户网站

网站图片设置4:3360网站关键词排名优化