From feedb550c846ec6b70ce929e587543dece65ed6e Mon Sep 17 00:00:00 2001 From: Joseph Musgrove <117590282+JoeDediop@users.noreply.github.com> Date: Wed, 6 Aug 2025 09:41:47 -0700 Subject: [PATCH 1/3] Update put() and append() docstrings --- pandas/io/pytables.py | 79 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 64a05c87e0f80..2b01743ec2470 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1170,8 +1170,17 @@ def put( complevel : int, 0-9, default None Specifies a compression level for data. A value of 0 or None disables compression. - min_itemsize : int, dict, or None - Dict of columns that specify minimum str sizes. + min_itemsize : int, dict of str: int, or None, default None + Minimum size in bytes for string columns. This parameter is only used when + format='table'. Can be: + - int: Apply the same minimum size to all string columns + - dict: Map column names to their minimum sizes + - None: Use default sizing + **Important**: The size refers to the number of bytes after encoding, not + the number of characters. For multi-byte characters (e.g., Chinese, Arabic), + you need to account for the encoding. For example, the character '香' is + 1 character but 3 bytes when encoded as UTF-8 + See examples below for proper usage with encoded strings. nan_rep : str Str to use as str nan representation. data_columns : list of columns or True, default None @@ -1203,6 +1212,23 @@ def put( >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP >>> store.put("data", df) # doctest: +SKIP + + Basic usage with ASCII strings: + >>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP + Usage with multi-byte characters: + >>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP + >>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes + >>> store.put('cities', df_unicode, format='table', # doctest: +SKIP + ... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP + Determining the correct size for encoded strings: + >>> text = '香港' # doctest: +SKIP + >>> len(text) # Character length # doctest: +SKIP + 2 + >>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP + 6 + >>> # Use the byte length for min_itemsize """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1330,8 +1356,17 @@ def append( A value of 0 or None disables compression. columns : default None This parameter is currently not accepted, try data_columns. - min_itemsize : int, dict, or None - Dict of columns that specify minimum str sizes. + min_itemsize : int, dict of str: int, or None, default None + Minimum size in bytes for string columns. Can be: + - int: Apply the same minimum size to all string columns + - dict: Map column names to their minimum sizes + - None: Use the existing table's column sizes + **Important**: This parameter is only effective when creating a new table. + If the table already exists, the column sizes are fixed and cannot be + changed. The size refers to the number of bytes after encoding, not + the number of characters. + For multi-byte characters, calculate the size using the encoded byte length. + For example: len('香'.encode('utf-8')) returns 3, not len('香') which returns 1. nan_rep : str Str to use as str nan representation. chunksize : int or None @@ -1364,6 +1399,10 @@ def append( Does *not* check if data being appended overlaps with existing data in the table, so be careful + When appending to an existing table, the min_itemsize parameter has no effect + as column sizes are already fixed. Set min_itemsize when initially creating + the table with put() or the first append() call. + Examples -------- >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) @@ -1377,6 +1416,38 @@ def append( 1 3 4 0 5 6 1 7 8 + + Creating a table and appending data: + + >>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B']) + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> # Set min_itemsize when creating the table + >>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP + >>> + >>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B']) + >>> store.append('data', df2) # doctest: +SKIP + >>> store.close() # doctest: +SKIP + + Handling multi-byte characters: + + >>> df_en = pd.DataFrame([['hello']], columns=['text']) + >>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes + >>> store.put('messages', df_en, format='table', + ... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP + >>> store.append('messages', df_zh) # doctest: +SKIP + >>> store.close() # doctest: +SKIP + + Common error when min_itemsize is too small: + + >>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8 + >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP + >>> # This will raise ValueError: string length [3] exceeds limit [1] + >>> # store.put('test', df, format='table', min_itemsize={'char': 1}) + >>> # Correct usage: + >>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP + >>> store.close() # doctest: +SKIP """ if columns is not None: raise TypeError( From 769e23f0d0d6ff7d0c6ddf36cd320aa0e3261780 Mon Sep 17 00:00:00 2001 From: Joseph Musgrove <117590282+JoeDediop@users.noreply.github.com> Date: Wed, 6 Aug 2025 09:49:29 -0700 Subject: [PATCH 2/3] Update pytables.py --- pandas/io/pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2b01743ec2470..5b5dd221bfc4f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1170,7 +1170,7 @@ def put( complevel : int, 0-9, default None Specifies a compression level for data. A value of 0 or None disables compression. - min_itemsize : int, dict of str: int, or None, default None + min_itemsize : int, dict of str: int, or None, default None Minimum size in bytes for string columns. This parameter is only used when format='table'. Can be: - int: Apply the same minimum size to all string columns @@ -1213,7 +1213,7 @@ def put( >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP >>> store.put("data", df) # doctest: +SKIP - Basic usage with ASCII strings: + Basic usage with ASCII strings: >>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B']) >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP >>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP From 32b4aa6c7940020c1c63ad4fd1eae5266c842113 Mon Sep 17 00:00:00 2001 From: Joseph Musgrove <117590282+JoeDediop@users.noreply.github.com> Date: Wed, 6 Aug 2025 10:30:59 -0700 Subject: [PATCH 3/3] Update pytables.py --- pandas/io/pytables.py | 93 +++++++++++-------------------------------- 1 file changed, 24 insertions(+), 69 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5b5dd221bfc4f..3c328939955d9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1169,18 +1169,16 @@ def put( This parameter is currently not accepted. complevel : int, 0-9, default None Specifies a compression level for data. - A value of 0 or None disables compression. + A value of 0 or None disables compression. min_itemsize : int, dict of str: int, or None, default None - Minimum size in bytes for string columns. This parameter is only used when - format='table'. Can be: - - int: Apply the same minimum size to all string columns - - dict: Map column names to their minimum sizes - - None: Use default sizing - **Important**: The size refers to the number of bytes after encoding, not - the number of characters. For multi-byte characters (e.g., Chinese, Arabic), - you need to account for the encoding. For example, the character '香' is - 1 character but 3 bytes when encoded as UTF-8 - See examples below for proper usage with encoded strings. + Minimum size in bytes for string columns when format = 'table'. + int - Apply the same minimum size to all string columns, + dict - Map column names to their minimum sizes or, + None - use the default the sizing + Important: This specifies the byte length after encoding, not the + character count. For multi-byte characters, calculate the required + size using the encoded byte length. + See examples below for use. nan_rep : str Str to use as str nan representation. data_columns : list of columns or True, default None @@ -1213,22 +1211,9 @@ def put( >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP >>> store.put("data", df) # doctest: +SKIP - Basic usage with ASCII strings: - >>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP - Usage with multi-byte characters: - >>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP - >>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes - >>> store.put('cities', df_unicode, format='table', # doctest: +SKIP - ... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP - Determining the correct size for encoded strings: - >>> text = '香港' # doctest: +SKIP - >>> len(text) # Character length # doctest: +SKIP - 2 - >>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP - 6 - >>> # Use the byte length for min_itemsize + >>> ASCII 'hello' = 5 bytes + >>> UTF-8 '香' = 3 bytes (though only 1 character) + >>> To find byte length: len(string.encode('utf-8')) """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1355,18 +1340,16 @@ def append( Specifies a compression level for data. A value of 0 or None disables compression. columns : default None - This parameter is currently not accepted, try data_columns. + This parameter is currently not accepted, try data_columns. min_itemsize : int, dict of str: int, or None, default None - Minimum size in bytes for string columns. Can be: - - int: Apply the same minimum size to all string columns - - dict: Map column names to their minimum sizes - - None: Use the existing table's column sizes - **Important**: This parameter is only effective when creating a new table. - If the table already exists, the column sizes are fixed and cannot be - changed. The size refers to the number of bytes after encoding, not - the number of characters. - For multi-byte characters, calculate the size using the encoded byte length. - For example: len('香'.encode('utf-8')) returns 3, not len('香') which returns 1. + Minimum size in bytes for string columns when format = 'table'. + int - Apply the same minimum size to all string columns, + dict - Map column names to their minimum sizes or, + None - use the default the sizing + Important: This specifies the byte length after encoding, not the + character count. For multi-byte characters, calculate the required + size using the encoded byte length. + See examples below for use. nan_rep : str Str to use as str nan representation. chunksize : int or None @@ -1417,37 +1400,9 @@ def append( 0 5 6 1 7 8 - Creating a table and appending data: - - >>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> # Set min_itemsize when creating the table - >>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP - >>> - >>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B']) - >>> store.append('data', df2) # doctest: +SKIP - >>> store.close() # doctest: +SKIP - - Handling multi-byte characters: - - >>> df_en = pd.DataFrame([['hello']], columns=['text']) - >>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes - >>> store.put('messages', df_en, format='table', - ... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP - >>> store.append('messages', df_zh) # doctest: +SKIP - >>> store.close() # doctest: +SKIP - - Common error when min_itemsize is too small: - - >>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8 - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> # This will raise ValueError: string length [3] exceeds limit [1] - >>> # store.put('test', df, format='table', min_itemsize={'char': 1}) - >>> # Correct usage: - >>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP - >>> store.close() # doctest: +SKIP + >>> ASCII 'hello' = 5 bytes + >>> UTF-8 '香' = 3 bytes (though only 1 character) + >>> To find byte length: len(string.encode('utf-8')) """ if columns is not None: raise TypeError(