Examples
Practical examples showing how to use the ARFF Format Converter v2.0 in various scenarios.
Quick Start Examples
Basic File Conversion
from pathlib import Path
from arff_format_converter import ARFFConverter
# Initialize converter with default settings
converter = ARFFConverter()
# Convert ARFF to Parquet (recommended for performance)
result = converter.convert(
    input_file=Path("iris.arff"),
    output_dir=Path("./output"),
    output_format="parquet"
)
print(f"Conversion completed in {result.duration:.2f}s")
print(f"Output: {result.output_file}")
print(f"File size: {result.file_size_mb:.1f} MB")

Ultra-Performance Mode
# Configure for maximum speed (production use)
converter = ARFFConverter(
    fast_mode=True,      # Skip validation for speed
    parallel=True,       # Multi-core processing
    use_polars=True,     # Polars optimization
    memory_map=True,     # Memory mapping for large files
    chunk_size=100000    # Large chunks
)
# Convert large dataset
result = converter.convert(
    input_file=Path("large_dataset.arff"),
    output_dir=Path("./output"),
    output_format="parquet"
)
print(f"Ultra-fast conversion: {result.duration:.2f}s")
print(f"Processing speed: {result.rows_processed / result.duration:.0f} rows/sec")

Batch Processing
Convert Multiple Files
from pathlib import Path
from arff_format_converter import ARFFConverter
converter = ARFFConverter(fast_mode=True, parallel=True)
# Find all ARFF files in directory
input_files = list(Path("datasets").glob("*.arff"))
print(f"Found {len(input_files)} ARFF files to convert")
# Batch convert to Parquet
results = converter.batch_convert(
    input_files=input_files,
    output_dir=Path("./parquet_output"),
    output_format="parquet",
    parallel=True
)
# Process results
total_time = sum(r.duration for r in results)
total_size = sum(r.file_size_mb for r in results)
print(f"\nš Batch Conversion Summary:")
print(f" Files processed: {len(results)}")
print(f" Total time: {total_time:.2f}s")
print(f" Total output size: {total_size:.1f} MB")
print(f" Average speed: {sum(r.rows_processed for r in results) / total_time:.0f} rows/sec")
# Show individual results
for result in results:
    status = "OK" if result.success else "FAILED"
    print(f"{status} {result.input_file.name}: {result.duration:.2f}s")

Format Comparison
# Convert same file to different formats for comparison
formats = ["csv", "json", "parquet", "xlsx", "xml"]
input_file = Path("customer_data.arff")
results = {}
for format_name in formats:
    result = converter.convert(
        input_file=input_file,
        output_dir=Path(f"./output_{format_name}"),
        output_format=format_name
    )
    results[format_name] = result
# Compare results
print("\nš Format Comparison:")
print("Format | Time | Size | Speed")
print("----------|----------|----------|----------")
for fmt, result in results.items():
speed = result.rows_processed / result.duration
print(f"{fmt:<9} | {result.duration:>6.2f}s | {result.file_size_mb:>6.1f}MB | {speed:>7.0f} r/s")
# Find best format
fastest = min(results.items(), key=lambda x: x[1].duration)
smallest = min(results.items(), key=lambda x: x[1].file_size_mb)
print(f"\nš Fastest: {fastest[0]} ({fastest[1].duration:.2f}s)")
print(f"šļø Smallest: {smallest[0]} ({smallest[1].file_size_mb:.1f}MB)")Performance Benchmarking
Comprehensive Benchmark
# Run detailed benchmark across all formats
benchmark_results = converter.benchmark(
    input_file=Path("benchmark_dataset.arff"),
    formats=["csv", "json", "parquet", "xlsx", "xml", "orc"],
    iterations=5  # Multiple runs for accuracy
)
print("\nPerformance Benchmark Results:")
print("=" * 60)
# Sort by performance
sorted_results = sorted(
    benchmark_results.items(),
    key=lambda x: x[1]['duration']
)
for format_name, metrics in sorted_results:
    print(f"\n{format_name.upper()}:")
    print(f"  Average time: {metrics['duration']:.1f}ms")
    print(f"  File size: {metrics['file_size_mb']:.1f}MB")
    print(f"  Compression: {metrics['compression_ratio']:.1f}x")
    print(f"  Rating: {metrics['speed_rating']}")
# Performance recommendations
fastest_format = sorted_results[0][0]
print(f"\nš Recommended format for speed: {fastest_format}")
smallest_format = min(
benchmark_results.items(),
key=lambda x: x[1]['file_size_mb']
)[0]
print(f"šļø Recommended format for size: {smallest_format}")CLI Examples
Command Line Usage
# Basic conversion
arff-format-converter --file data.arff --output ./output --format parquet
# High-performance mode
arff-format-converter --file large_data.arff --output ./output --format parquet --fast --parallel
# Large file processing with verbose output
arff-format-converter \
    --file huge_dataset.arff \
    --output ./output \
    --format parquet \
    --chunk-size 100000 \
    --verbose
# Benchmark all formats
arff-format-converter --file test_data.arff --output ./benchmarks --benchmark
# Get system information
arff-format-converter --info
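These commands can also be driven from a script. Below is a minimal, hypothetical sketch that shells out to the CLI with Python's standard subprocess module; the convert_via_cli helper is an illustration (not part of the package) and assumes the arff-format-converter executable is on your PATH, using only the flags shown above.
import subprocess
from pathlib import Path

def convert_via_cli(arff_file: Path, output_dir: Path, fmt: str = "parquet") -> bool:
    """Hypothetical wrapper: run the CLI for one file and report success."""
    completed = subprocess.run(
        [
            "arff-format-converter",
            "--file", str(arff_file),
            "--output", str(output_dir),
            "--format", fmt,
            "--fast", "--parallel",
        ],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        print(f"Conversion failed: {completed.stderr.strip()}")
    return completed.returncode == 0

convert_via_cli(Path("data.arff"), Path("./output"))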
Production Use Cases
Data Pipeline Integration
import logging
from pathlib import Path
from arff_format_converter import ARFFConverter
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DataPipeline:
    def __init__(self):
        self.converter = ARFFConverter(
            fast_mode=True,
            parallel=True,
            use_polars=True,
            memory_map=True
        )

    def process_dataset(self, arff_path: Path, output_dir: Path):
        """Process a single ARFF file with error handling."""
        try:
            logger.info(f"Processing {arff_path.name}...")
            # Convert to Parquet for optimal performance
            result = self.converter.convert(
                input_file=arff_path,
                output_dir=output_dir,
                output_format="parquet"
            )
            logger.info(f"Success: {result.duration:.2f}s, "
                        f"{result.file_size_mb:.1f}MB")
            return result
        except Exception as e:
            logger.error(f"Failed to process {arff_path.name}: {e}")
            return None

    def bulk_process(self, input_dir: Path, output_dir: Path):
        """Process all ARFF files in a directory."""
        arff_files = list(input_dir.glob("*.arff"))
        if not arff_files:
            logger.warning(f"No ARFF files found in {input_dir}")
            return
        logger.info(f"Found {len(arff_files)} files to process")
        # Use batch processing for efficiency
        results = self.converter.batch_convert(
            input_files=arff_files,
            output_dir=output_dir,
            output_format="parquet",
            parallel=True
        )
        # Report results
        successful = [r for r in results if r.success]
        failed = [r for r in results if not r.success]
        logger.info("\nPipeline Summary:")
        logger.info(f"  Successful: {len(successful)}")
        logger.info(f"  Failed: {len(failed)}")
        if successful:
            avg_time = sum(r.duration for r in successful) / len(successful)
            total_size = sum(r.file_size_mb for r in successful)
            logger.info(f"  Average time: {avg_time:.2f}s")
            logger.info(f"  Total output: {total_size:.1f}MB")

# Usage
pipeline = DataPipeline()
pipeline.bulk_process(
    input_dir=Path("./raw_data"),
    output_dir=Path("./processed_data")
)

Memory-Efficient Processing
# For systems with limited memory
converter = ARFFConverter(
    fast_mode=False,    # Enable validation
    parallel=False,     # Single-threaded
    use_polars=False,   # Use pandas only
    chunk_size=5000     # Smaller chunks
)
# Process very large file in chunks
def process_large_file(file_path: Path, output_dir: Path):
    """Handle files that don't fit in memory."""
    print(f"Processing large file: {file_path.name}")
    print("Using memory-efficient settings...")
    result = converter.convert(
        input_file=file_path,
        output_dir=output_dir,
        output_format="parquet"  # Parquet handles large files well
    )
    if result.success:
        print("Large file processed successfully!")
        print(f"  Time: {result.duration:.2f}s")
        print(f"  Rows: {result.rows_processed:,}")
        print(f"  Output: {result.file_size_mb:.1f}MB")
        # Calculate processing rate
        rate = result.rows_processed / result.duration
        print(f"  Rate: {rate:.0f} rows/second")
    else:
        print(f"Processing failed: {result.error_message}")

# Example usage
process_large_file(
    file_path=Path("very_large_dataset.arff"),
    output_dir=Path("./large_output")
)

Advanced Use Cases
Custom Validation and Error Handling
from pathlib import Path
from arff_format_converter import ARFFConverter
def safe_convert_with_fallback(input_file: Path, output_dir: Path):
    """Convert with a fallback to safer settings on failure."""
    # Try ultra-fast mode first
    fast_converter = ARFFConverter(
        fast_mode=True,
        parallel=True,
        use_polars=True,
        memory_map=True
    )
    try:
        print("Attempting ultra-fast conversion...")
        result = fast_converter.convert(
            input_file=input_file,
            output_dir=output_dir,
            output_format="parquet"
        )
        if result.success:
            print(f"Ultra-fast success: {result.duration:.2f}s")
            return result
        else:
            raise Exception(result.error_message)
    except Exception as e:
        print(f"Ultra-fast mode failed: {e}")
        print("Falling back to safe mode...")
        # Fall back to safe settings
        safe_converter = ARFFConverter(
            fast_mode=False,
            parallel=False,
            use_polars=False,
            chunk_size=1000
        )
        try:
            result = safe_converter.convert(
                input_file=input_file,
                output_dir=output_dir,
                output_format="csv"  # Fall back to a simpler format
            )
            if result.success:
                print(f"Safe mode success: {result.duration:.2f}s")
                return result
            else:
                raise Exception(result.error_message)
        except Exception as e2:
            print(f"Both modes failed: {e2}")
            return None

# Example usage
result = safe_convert_with_fallback(
    input_file=Path("problematic_data.arff"),
    output_dir=Path("./output")
)

Performance Tips
- ⢠Use Parquet format for best performance and compression
- ⢠Enable fast_mode for production workloads (20-30% faster)
- ⢠Use parallel=True on multi-core systems (2-4x speedup)
- ⢠Increase chunk_size for large files (better memory utilization)
- ⢠Enable memory_map for files larger than 1GB
- ⢠Benchmark your specific use case to find optimal settings
Format Recommendations
- ⢠Parquet: Best overall (speed + compression + compatibility)
- ⢠CSV: Maximum compatibility, human-readable
- ⢠JSON: Web APIs, NoSQL databases
- ⢠ORC: Big data analytics (Spark, Hive)
- ⢠XLSX: Business reports, Excel compatibility
- ⢠XML: Legacy systems, structured documents