@@ -1170,8 +1170,17 @@ def put(
11701170 complevel : int, 0-9, default None
11711171 Specifies a compression level for data.
11721172 A value of 0 or None disables compression.
1173- min_itemsize : int, dict, or None
1174- Dict of columns that specify minimum str sizes.
1173+ min_itemsize : int, dict of str: int, or None, default None
1174+ Minimum size in bytes for string columns. This parameter is only used when
1175+ format='table'. Can be:
1176+ - int: Apply the same minimum size to all string columns
1177+ - dict: Map column names to their minimum sizes
1178+ - None: Use default sizing
1179+ **Important**: The size refers to the number of bytes after encoding, not
1180+ the number of characters. For multi-byte characters (e.g., Chinese, Arabic),
1181+ you need to account for the encoding. For example, the character '香' is
1182+ 1 character but 3 bytes when encoded as UTF-8.
1183+ See examples below for proper usage with encoded strings.
11751184 nan_rep : str
11761185 Str to use as str nan representation.
11771186 data_columns : list of columns or True, default None
@@ -1203,6 +1212,23 @@ def put(
12031212 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
12041213 >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
12051214 >>> store.put("data", df) # doctest: +SKIP
1215+
1216+ Basic usage with ASCII strings:
1217+ >>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B'])
1218+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1219+ >>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP
1220+ Usage with multi-byte characters:
1221+ >>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2'])
1222+ >>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes
1223+ >>> store.put('cities', df_unicode, format='table', # doctest: +SKIP
1224+ ... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP
1225+ Determining the correct size for encoded strings:
1226+ >>> text = '香港' # doctest: +SKIP
1227+ >>> len(text) # Character length # doctest: +SKIP
1228+ 2
1229+ >>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP
1230+ 6
1231+ >>> # Use the byte length for min_itemsize
12061232 """
12071233 if format is None :
12081234 format = get_option ("io.hdf.default_format" ) or "fixed"
@@ -1330,8 +1356,17 @@ def append(
13301356 A value of 0 or None disables compression.
13311357 columns : default None
13321358 This parameter is currently not accepted, try data_columns.
1333- min_itemsize : int, dict, or None
1334- Dict of columns that specify minimum str sizes.
1359+ min_itemsize : int, dict of str: int, or None, default None
1360+ Minimum size in bytes for string columns. Can be:
1361+ - int: Apply the same minimum size to all string columns
1362+ - dict: Map column names to their minimum sizes
1363+ - None: Use the existing table's column sizes
1364+ **Important**: This parameter is only effective when creating a new table.
1365+ If the table already exists, the column sizes are fixed and cannot be
1366+ changed. The size refers to the number of bytes after encoding, not
1367+ the number of characters.
1368+ For multi-byte characters, calculate the size using the encoded byte length.
1369+ For example: len('香'.encode('utf-8')) returns 3, not len('香') which returns 1.
13351370 nan_rep : str
13361371 Str to use as str nan representation.
13371372 chunksize : int or None
@@ -1364,6 +1399,10 @@ def append(
13641399 Does *not* check if data being appended overlaps with existing
13651400 data in the table, so be careful
13661401
1402+ When appending to an existing table, the min_itemsize parameter has no effect
1403+ as column sizes are already fixed. Set min_itemsize when initially creating
1404+ the table with put() or the first append() call.
1405+
13671406 Examples
13681407 --------
13691408 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
@@ -1377,6 +1416,38 @@ def append(
13771416 1 3 4
13781417 0 5 6
13791418 1 7 8
1419+
1420+ Creating a table and appending data:
1421+
1422+ >>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B'])
1423+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1424+ >>> # Set min_itemsize when creating the table
1425+ >>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP
1426+ >>>
1427+ >>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B'])
1428+ >>> store.append('data', df2) # doctest: +SKIP
1429+ >>> store.close() # doctest: +SKIP
1430+
1431+ Handling multi-byte characters:
1432+
1433+ >>> df_en = pd.DataFrame([['hello']], columns=['text'])
1434+ >>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese
1435+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1436+ >>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes
1437+ >>> store.put('messages', df_en, format='table',
1438+ ... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP
1439+ >>> store.append('messages', df_zh) # doctest: +SKIP
1440+ >>> store.close() # doctest: +SKIP
1441+
1442+ Common error when min_itemsize is too small:
1443+
1444+ >>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8
1445+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1446+ >>> # This will raise ValueError: string length [3] exceeds limit [1]
1447+ >>> # store.put('test', df, format='table', min_itemsize={'char': 1})
1448+ >>> # Correct usage:
1449+ >>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP
1450+ >>> store.close() # doctest: +SKIP
13801451 """
13811452 if columns is not None :
13821453 raise TypeError (
0 commit comments