How to Get the Uncompressed and Compressed File Size in Python
Knowing the size of files, both before and after compression, is useful for storage management, bandwidth estimation, monitoring compression ratios, and building file management tools. Python provides built-in modules to work with various compressed formats and retrieve file sizes easily. In this guide, we'll show you how to get both uncompressed and compressed file sizes for different archive formats.
Understanding Compressed vs. Uncompressed Size
- Uncompressed size: The total size of the original file(s) before compression.
- Compressed size: The size of the file after compression has been applied.
- Compression ratio: the fraction of space saved by compression, computed as 1 - (compressed / uncompressed).
For archive files like ZIP, the compressed file on disk may be smaller than the original data it contains. The archive stores metadata about both sizes.
Method 1: Using os and zipfile (ZIP Files)
The zipfile module (built-in) lets you inspect ZIP archives and get both the compressed and uncompressed sizes stored in the archive metadata:
import os
import zipfile
def get_zip_sizes(zip_path):
    """Return size statistics for a ZIP archive.

    Returns a dict with the archive's size on disk, the summed
    uncompressed and compressed sizes of its file entries, and the
    resulting compression ratio (0 when the archive holds no data).
    """
    # Size of the ZIP container itself, as stored on disk.
    archive_size = os.path.getsize(zip_path)

    uncompressed = 0
    compressed = 0
    with zipfile.ZipFile(zip_path, 'r') as archive:
        for entry in archive.infolist():
            # Directory entries carry no data; skip them.
            if entry.is_dir():
                continue
            uncompressed += entry.file_size      # original size
            compressed += entry.compress_size    # size after compression

    ratio = 1 - (compressed / uncompressed) if uncompressed > 0 else 0
    return {
        'archive_size': archive_size,
        'uncompressed_total': uncompressed,
        'compressed_total': compressed,
        'ratio': ratio,
    }
# Usage
zip_path = "example.zip"
sizes = get_zip_sizes(zip_path)
print(f"Archive size on disk: {sizes['archive_size']:,} bytes")
print(f"Uncompressed total: {sizes['uncompressed_total']:,} bytes")
print(f"Compressed total: {sizes['compressed_total']:,} bytes")
print(f"Compression ratio: {sizes['ratio']:.1%}")
Output:
Archive size on disk: 3,245,678 bytes
Uncompressed total: 7,909,996 bytes
Compressed total: 3,244,512 bytes
Compression ratio: 59.0%
Listing Individual File Sizes in a ZIP
import zipfile
def list_zip_contents(zip_path):
    """Print a table of every file in a ZIP with its original size,
    compressed size, and per-file compression ratio."""
    with zipfile.ZipFile(zip_path, 'r') as archive:
        print(f"{'Filename':<30} {'Original':>12} {'Compressed':>12} {'Ratio':>8}")
        print("-" * 65)
        for entry in archive.infolist():
            # Directories have no payload to report.
            if entry.is_dir():
                continue
            # Guard against zero-byte entries to avoid dividing by zero.
            saved = 0
            if entry.file_size > 0:
                saved = 1 - (entry.compress_size / entry.file_size)
            row = (
                f"{entry.filename:<30} "
                f"{entry.file_size:>10,} B "
                f"{entry.compress_size:>10,} B "
                f"{saved:>7.1%}"
            )
            print(row)
list_zip_contents("example.zip")
Output:
Filename Original Compressed Ratio
-----------------------------------------------------------------
data/report.csv 1,234,567 B 456,789 B 63.0%
images/photo.jpg 2,345,678 B 2,340,123 B 0.2%
documents/readme.txt 12,345 B 4,567 B 63.0%
Note that JPEG images show very low compression ratios because they are already compressed. Text files and CSV data typically compress much better.
Method 2: Using gzip (GZ Files)
For .gz files, you can get the compressed size from disk and read the uncompressed size from the GZIP header:
import os
import gzip
import struct
def get_gzip_sizes(gz_path):
    """Return (compressed_size, uncompressed_size) for a .gz file.

    The uncompressed size is read from the 4-byte little-endian footer
    mandated by the GZIP format. It is stored modulo 2**32, so the value
    is unreliable for payloads larger than 4 GB.
    """
    compressed_size = os.path.getsize(gz_path)
    with open(gz_path, 'rb') as fh:
        fh.seek(-4, os.SEEK_END)  # footer: last 4 bytes of the file
        (uncompressed_size,) = struct.unpack('<I', fh.read(4))
    return compressed_size, uncompressed_size
# Usage
gz_path = "data.csv.gz"
compressed, uncompressed = get_gzip_sizes(gz_path)
print(f"Compressed size: {compressed:,} bytes")
print(f"Uncompressed size: {uncompressed:,} bytes")
print(f"Compression ratio: {1 - compressed / uncompressed:.1%}")
Output:
Compressed size: 1,234,567 bytes
Uncompressed size: 5,678,901 bytes
Compression ratio: 78.3%
The GZIP footer stores the uncompressed size as a 32-bit integer, which means it wraps around for files larger than 4 GB. For large files, you will need to decompress and count bytes to get the accurate uncompressed size.
Accurate Method for Large GZIP Files
import gzip
import os
def get_gzip_uncompressed_size(gz_path):
    """Return the exact uncompressed byte count of a .gz file.

    Streams and decompresses the whole file, so it is accurate even
    beyond the 4 GB limit of the GZIP footer field — at the cost of
    reading everything once.
    """
    chunk_size = 1024 * 1024  # 1 MB reads keep memory usage flat
    total = 0
    with gzip.open(gz_path, 'rb') as stream:
        while chunk := stream.read(chunk_size):
            total += len(chunk)
    return total
gz_path = "large_data.csv.gz"
compressed = os.path.getsize(gz_path)
uncompressed = get_gzip_uncompressed_size(gz_path)
print(f"Compressed: {compressed:,} bytes ({compressed / 1e6:.1f} MB)")
print(f"Uncompressed: {uncompressed:,} bytes ({uncompressed / 1e6:.1f} MB)")
Method 3: Using tarfile (TAR.GZ Files)
For .tar.gz (or .tgz) archives:
import os
import tarfile
def get_targz_sizes(tar_path):
    """Return (compressed_size, uncompressed_total) for a .tar.gz archive.

    The compressed size is the archive's size on disk; the uncompressed
    total sums the stored size of every regular-file member.
    """
    compressed_size = os.path.getsize(tar_path)
    with tarfile.open(tar_path, 'r:gz') as archive:
        uncompressed_total = sum(
            member.size for member in archive.getmembers() if member.isfile()
        )
    return compressed_size, uncompressed_total
# Usage
tar_path = "archive.tar.gz"
compressed, uncompressed = get_targz_sizes(tar_path)
print(f"Compressed: {compressed:,} bytes")
print(f"Uncompressed: {uncompressed:,} bytes")
Method 4: Compressing a File and Comparing Sizes
If you want to see how well a file would compress without creating a permanent compressed file, compute the compression in memory:
import os
import gzip
import zlib
def compare_compression(file_path):
    """Print a comparison of the original file size against GZIP and
    ZLIB (fast and best levels) compressed sizes.

    Reads the whole file into memory, so it is only suitable for files
    that fit comfortably in RAM. Prints the table; returns None.
    """
    # Bug fix: the size helper lives in os.path, not os — the original
    # `os.getsize(...)` raises AttributeError.
    original_size = os.path.getsize(file_path)
    with open(file_path, 'rb') as f:
        data = f.read()
    # GZIP compression (default level)
    gzip_data = gzip.compress(data)
    gzip_size = len(gzip_data)
    # ZLIB compression at the speed/size extremes
    zlib_fast = zlib.compress(data, level=1)
    zlib_best = zlib.compress(data, level=9)
    print(f"Original size: {original_size:>12,} bytes")
    print(f"GZIP compressed: {gzip_size:>12,} bytes ({1 - gzip_size/original_size:.1%} saved)")
    print(f"ZLIB (fast, level=1): {len(zlib_fast):>12,} bytes ({1 - len(zlib_fast)/original_size:.1%} saved)")
    print(f"ZLIB (best, level=9): {len(zlib_best):>12,} bytes ({1 - len(zlib_best)/original_size:.1%} saved)")
compare_compression("data.csv")
Output:
Original size: 5,678,901 bytes
GZIP compressed: 1,234,567 bytes (78.3% saved)
ZLIB (fast, level=1): 1,456,789 bytes (74.4% saved)
ZLIB (best, level=9): 1,198,765 bytes (78.9% saved)
Utility: Human-Readable File Sizes
All examples so far show sizes in bytes. Here's a helper to format sizes readably:
def format_size(size_bytes):
    """Convert a byte count to a human-readable string (1024-based),
    e.g. 1234567 -> '1.2 MB'. Falls through to PB for huge values."""
    value = size_bytes
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        # abs() so negative deltas format with the right magnitude too.
        if abs(value) < 1024:
            return f"{value:.1f} {unit}"
        value /= 1024
    return f"{value:.1f} PB"
# Usage
print(format_size(1234567)) # 1.2 MB
print(format_size(7909996)) # 7.5 MB
print(format_size(1073741824)) # 1.0 GB
Output:
1.2 MB
7.5 MB
1.0 GB
Complete Example: Universal Archive Inspector
import os
import zipfile
import tarfile
import gzip
import struct
def format_size(size_bytes):
    """Convert a byte count to a human-readable string (1024-based)."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if abs(size_bytes) < 1024:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024
    return f"{size_bytes:.1f} TB"

def inspect_archive(filepath):
    """Inspect a supported archive (.zip, .tar.gz/.tgz, .gz) and print
    its size on disk, uncompressed size, file count where available,
    and the fraction of space saved."""
    ext = os.path.splitext(filepath)[1].lower()
    disk_size = os.path.getsize(filepath)
    print(f"File: {os.path.basename(filepath)}")
    print(f"Size on disk: {format_size(disk_size)}")
    # Bug fix: replaced the fragile "'total' in dir()" check with an
    # explicit None sentinel set by each branch.
    total = None
    if ext == '.zip':
        with zipfile.ZipFile(filepath, 'r') as zf:
            files = [i for i in zf.infolist() if not i.is_dir()]
            total = sum(i.file_size for i in files)
            print(f"Uncompressed: {format_size(total)}")
            print(f"Files: {len(files)}")
    # Bug fix: test for .tar.gz/.tgz BEFORE bare .gz — splitext() yields
    # '.gz' for 'archive.tar.gz', so the original order made this branch
    # unreachable for .tar.gz files.
    elif ext == '.tgz' or filepath.endswith('.tar.gz'):
        with tarfile.open(filepath, 'r:gz') as tar:
            members = [m for m in tar.getmembers() if m.isfile()]
            total = sum(m.size for m in members)
            print(f"Uncompressed: {format_size(total)}")
            print(f"Files: {len(members)}")
    elif ext == '.gz':
        # GZIP footer stores the length modulo 2**32 — approximate >4 GB.
        with open(filepath, 'rb') as f:
            f.seek(-4, os.SEEK_END)
            total = struct.unpack('<I', f.read(4))[0]
        print(f"Uncompressed: {format_size(total)} (approx)")
    # Also an improvement over the original: plain .gz files now get a
    # ratio too, since their uncompressed size feeds the same sentinel.
    if total:
        print(f"Space saved: {1 - (disk_size / total):.1%}")
# Usage
inspect_archive("data.zip")
Quick Reference
| Format | Module | Get Compressed Size | Get Uncompressed Size |
|---|---|---|---|
| ZIP | zipfile | info.compress_size | info.file_size |
| GZIP | gzip | os.path.getsize() | Last 4 bytes of file (or decompress) |
| TAR.GZ | tarfile | os.path.getsize() | member.size |
| Any file | os | os.path.getsize() | N/A (already uncompressed) |
Conclusion
To get file sizes in Python, use os.path.getsize() for the size on disk and format-specific modules for internal archive sizes.
- For ZIP files, the zipfile module provides both file_size (uncompressed) and compress_size (compressed) for each entry.
- For GZIP files, the compressed size is the file size on disk, and the uncompressed size can be read from the GZIP footer (for files under 4 GB) or obtained exactly by decompressing the data.
- For TAR.GZ archives, the tarfile module exposes each member's original size.
Choose the method that matches your archive format and file size requirements.