# Day 58: Model Quantization
## Learning Objectives

- Understand the basic concepts of quantization
- Master dynamic quantization
- Learn static quantization techniques
- Get familiar with 4-bit quantization
- Master GPTQ quantization
## Quantization Basics

### What Is Quantization

Quantization converts model parameters from high precision (e.g. FP32) to low precision (e.g. INT8) to reduce model size and compute requirements.

Core idea:

FP32 (32-bit float) → INT8 (8-bit integer) → roughly 4× smaller model

Quantization types (a numeric sketch of the underlying mapping follows the list):

- Dynamic quantization: quantization parameters are computed on the fly at runtime
- Static quantization: quantization parameters are precomputed from calibration data
- Post-training quantization: applied after training is finished
- Quantization-aware training: quantization is simulated during training
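
To make the core idea concrete, here is the affine (asymmetric) mapping behind INT8 quantization as a minimal, illustrative sketch: a float tensor is mapped to 8-bit integers through a scale and a zero point, and mapped back with a small rounding error.

```python
import torch

def quantize_tensor(x: torch.Tensor):
    """Affine int8 quantization of a float tensor (assumes x is not constant)."""
    qmin, qmax = -128, 127
    scale = (x.max() - x.min()) / (qmax - qmin)        # step size between integer levels
    zero_point = qmin - torch.round(x.min() / scale)   # integer that represents 0.0
    q = torch.clamp(torch.round(x / scale + zero_point), qmin, qmax).to(torch.int8)
    return q, scale, zero_point

def dequantize_tensor(q, scale, zero_point):
    return (q.float() - zero_point) * scale

x = torch.randn(8)
q, scale, zp = quantize_tensor(x)
x_hat = dequantize_tensor(q, scale, zp)
print((x - x_hat).abs().max())  # small reconstruction error
```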
### Benefits of Quantization

1. Smaller model size

```python
def compare_model_sizes():
    # load_model and quantize_dynamic are placeholders for the loading and
    # quantization steps shown later; the measurement helpers are sketched below
    model_fp32 = load_model("model_fp32")
    model_int8 = quantize_dynamic(model_fp32)
    size_fp32 = get_model_size(model_fp32)
    size_int8 = get_model_size(model_int8)
    print(f"FP32 Model Size: {size_fp32:.2f} MB")
    print(f"INT8 Model Size: {size_int8:.2f} MB")
    print(f"Reduction: {(1 - size_int8/size_fp32)*100:.1f}%")
```

2. Faster inference
```python
import time

def compare_inference_speed():
    model_fp32 = load_model("model_fp32")
    model_int8 = quantize_dynamic(model_fp32)
    input_data = generate_test_data(100)
    # time 100 forward passes per model
    start_time = time.time()
    for _ in range(100):
        model_fp32(input_data)
    fp32_time = time.time() - start_time
    start_time = time.time()
    for _ in range(100):
        model_int8(input_data)
    int8_time = time.time() - start_time
    print(f"FP32 Inference Time: {fp32_time:.2f}s")
    print(f"INT8 Inference Time: {int8_time:.2f}s")
    print(f"Speedup: {fp32_time/int8_time:.2f}x")
```

3. Lower memory footprint
```python
def compare_memory_usage():
    model_fp32 = load_model("model_fp32")
    model_int8 = quantize_dynamic(model_fp32)
    mem_fp32 = get_memory_usage(model_fp32)
    mem_int8 = get_memory_usage(model_int8)
    print(f"FP32 Memory: {mem_fp32:.2f} GB")
    print(f"INT8 Memory: {mem_int8:.2f} GB")
    print(f"Reduction: {(1 - mem_int8/mem_fp32)*100:.1f}%")
```
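
The three snippets above rely on helpers that are not defined in this lesson (`load_model`, `quantize_dynamic`, `generate_test_data`, and the measurement functions). A minimal sketch of the two measurement helpers, under the assumption that serialized size and parameter bytes are good enough proxies:

```python
import io
import torch

def get_model_size(model) -> float:
    """Serialized size of the model's state dict, in MB."""
    buffer = io.BytesIO()
    torch.save(model.state_dict(), buffer)
    return buffer.getbuffer().nbytes / 1024**2

def get_memory_usage(model) -> float:
    """Approximate in-memory footprint of parameters and buffers, in GB.

    Note: dynamically quantized modules pack their weights, so the
    serialized size above is the more reliable measurement for them.
    """
    n_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    n_bytes += sum(b.numel() * b.element_size() for b in model.buffers())
    return n_bytes / 1024**3
```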
## Dynamic Quantization

### Basic Implementation
```python
import torch
import torch.nn as nn

class DynamicQuantizer:
    def __init__(self, model: nn.Module):
        self.model = model

    def quantize(self, qconfig_spec: dict = None):
        # map module types to dynamic qconfigs; only these layer types are quantized
        if qconfig_spec is None:
            qconfig_spec = {
                nn.Linear: torch.quantization.default_dynamic_qconfig,
                nn.LSTM: torch.quantization.default_dynamic_qconfig,
                nn.GRU: torch.quantization.default_dynamic_qconfig
            }
        # quantize_dynamic performs prepare + convert in a single call
        return torch.quantization.quantize_dynamic(self.model, qconfig_spec)

    def quantize_to_int8(self):
        # dynamic quantization supports Linear and RNN layers (not Conv2d)
        model_int8 = torch.quantization.quantize_dynamic(
            self.model,
            {nn.Linear},
            dtype=torch.qint8
        )
        return model_int8

    def quantize_to_float16(self):
        # plain half-precision cast: smaller and faster, but not integer quantization
        model_fp16 = self.model.half()
        return model_fp16
```
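
Before applying this to a large model, it helps to sanity-check the mechanics on a toy network. A minimal, self-contained sketch (CPU only, since PyTorch dynamic quantization targets the CPU backend):

```python
import torch
import torch.nn as nn

toy = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 8))
toy_int8 = torch.quantization.quantize_dynamic(toy, {nn.Linear}, dtype=torch.qint8)
print(toy_int8)                   # Linear layers now show up as DynamicQuantizedLinear
_ = toy_int8(torch.randn(1, 64))  # int8 weights, activations quantized on the fly
```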
### Applying Dynamic Quantization

```python
import time
import torch

class DynamicQuantizationApp:
    def __init__(self, model_path: str):
        self.model = self._load_model(model_path)
        self.quantizer = DynamicQuantizer(self.model)

    def _load_model(self, model_path: str):
        from transformers import AutoModelForCausalLM
        model = AutoModelForCausalLM.from_pretrained(model_path)
        model.eval()
        return model

    def quantize_model(self, dtype: str = "int8"):
        if dtype == "int8":
            quantized_model = self.quantizer.quantize_to_int8()
        elif dtype == "float16":
            quantized_model = self.quantizer.quantize_to_float16()
        else:
            raise ValueError(f"Unsupported dtype: {dtype}")
        return quantized_model

    def compare_performance(self, test_data):
        original_model = self.model
        quantized_model = self.quantize_model("int8")
        original_time = self._measure_inference_time(original_model, test_data)
        quantized_time = self._measure_inference_time(quantized_model, test_data)
        return {
            "original_time": original_time,
            "quantized_time": quantized_time,
            "speedup": original_time / quantized_time
        }

    def _measure_inference_time(self, model, test_data, n_runs: int = 100):
        # average latency over n_runs forward passes
        start_time = time.time()
        for _ in range(n_runs):
            with torch.no_grad():
                _ = model(**test_data)
        return (time.time() - start_time) / n_runs
```
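
Usage might look like this (the checkpoint name is a placeholder; any causal LM whose layers are `nn.Linear`, such as the OPT family, benefits from dynamic quantization):

```python
from transformers import AutoTokenizer

app = DynamicQuantizationApp("facebook/opt-125m")   # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
test_data = dict(tokenizer("Quantization reduces model size.", return_tensors="pt"))
print(app.compare_performance(test_data))           # original vs. int8 latency and speedup
```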
## Static Quantization

### Basic Implementation
```python
class StaticQuantizer:
    def __init__(self, model: nn.Module):
        self.model = model
        # eager-mode static quantization requires a qconfig before prepare()
        self.model.qconfig = torch.quantization.get_default_qconfig("fbgemm")

    def quantize(self, calibration_loader: list):
        model_prepared = torch.quantization.prepare(
            self.model,
            inplace=False
        )
        # calibration: feed representative data so observers record activation ranges
        with torch.no_grad():
            for data in calibration_loader:
                model_prepared(data)
        model_quantized = torch.quantization.convert(model_prepared)
        return model_quantized

    def calibrate(self, calibration_loader: list):
        # prepare and calibrate without converting (useful for inspecting observers)
        model_prepared = torch.quantization.prepare(
            self.model,
            inplace=False
        )
        with torch.no_grad():
            for data in calibration_loader:
                model_prepared(data)
        return model_prepared
```
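
Eager-mode static quantization is easiest to see on a small CPU model with explicit quant/dequant stubs. A minimal sketch of the full prepare → calibrate → convert cycle:

```python
import torch
import torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = torch.quantization.QuantStub()      # float -> int8 at the input
        self.fc = nn.Linear(16, 4)
        self.dequant = torch.quantization.DeQuantStub()  # int8 -> float at the output

    def forward(self, x):
        return self.dequant(self.fc(self.quant(x)))

net = TinyNet().eval()
net.qconfig = torch.quantization.get_default_qconfig("fbgemm")  # x86 CPU backend
prepared = torch.quantization.prepare(net)
with torch.no_grad():
    for _ in range(10):                 # calibration: observers record value ranges
        prepared(torch.randn(8, 16))
quantized = torch.quantization.convert(prepared)
print(quantized)
```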
### Applying Static Quantization

```python
class StaticQuantizationApp:
    def __init__(self, model_path: str):
        self.model = self._load_model(model_path)
        self.quantizer = StaticQuantizer(self.model)

    def _load_model(self, model_path: str):
        from transformers import AutoModelForCausalLM
        model = AutoModelForCausalLM.from_pretrained(model_path)
        model.eval()
        return model

    def prepare_calibration_data(self, dataset, n_samples: int = 100):
        # use the first n_samples items as calibration data
        calibration_data = []
        for i, item in enumerate(dataset):
            if i >= n_samples:
                break
            calibration_data.append(item)
        return calibration_data

    def quantize_model(self, calibration_data):
        quantized_model = self.quantizer.quantize(calibration_data)
        return quantized_model

    def evaluate_quantized_model(self, quantized_model, test_dataset):
        # ModelEvaluator is a placeholder for your evaluation harness
        evaluator = ModelEvaluator(quantized_model)
        metrics = evaluator.evaluate(test_dataset)
        return metrics
```

## 4-bit Quantization
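
NF4 (4-bit NormalFloat) is the data type introduced with QLoRA: its 16 levels are spaced for normally distributed weights rather than uniformly, which typically preserves accuracy better than uniform 4-bit formats such as FP4. Double quantization (`bnb_4bit_use_double_quant=True`) additionally quantizes the quantization constants themselves to save a little more memory, and `bnb_4bit_compute_dtype` sets the precision used for the actual matrix multiplications.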
### NF4 Quantization
```python
import torch

class NF4Quantizer:
    def __init__(self, model_name_or_path: str):
        self.model_name_or_path = model_name_or_path

    def _quantize_4bit(self, quant_type: str):
        # bitsandbytes 4-bit quantization is applied at load time, so the
        # model must be (re)loaded with the quantization config attached
        try:
            from transformers import BitsAndBytesConfig, AutoModelForCausalLM
        except ImportError:
            raise ImportError("Install transformers: pip install transformers")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=quant_type,         # "nf4" or "fp4"
            bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
            bnb_4bit_compute_dtype=torch.bfloat16   # precision used for matmuls
        )
        return AutoModelForCausalLM.from_pretrained(
            self.model_name_or_path,
            quantization_config=bnb_config,
            device_map="auto"
        )

    def quantize_to_nf4(self):
        return self._quantize_4bit("nf4")

    def quantize_to_fp4(self):
        return self._quantize_4bit("fp4")
```
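
As a back-of-the-envelope check of what 4-bit storage buys (weights only, ignoring the small overhead of quantization constants):

```python
params = 7e9  # e.g. a 7B-parameter model
for name, bits in [("FP32", 32), ("FP16", 16), ("INT8", 8), ("NF4", 4)]:
    print(f"{name}: {params * bits / 8 / 1024**3:.1f} GB")
# FP32: 26.1 GB, FP16: 13.0 GB, INT8: 6.5 GB, NF4: 3.3 GB
```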
### Applying 4-bit Quantization

```python
import torch

class FourBitQuantizationApp:
    def __init__(self, model_path: str):
        self.model, self.tokenizer = self._load_model(model_path)
        self.quantizer = NF4Quantizer(model_path)

    def _load_model(self, model_path: str):
        from transformers import (
            AutoModelForCausalLM,
            AutoTokenizer,
            BitsAndBytesConfig,
        )
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=bnb_config,
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        return model, tokenizer

    def compare_memory_usage(self):
        # load_model_fp32 / get_memory_usage are placeholder helpers
        model_fp32 = load_model_fp32(self.model.config.name_or_path)
        model_int4 = self.model
        mem_fp32 = get_memory_usage(model_fp32)
        mem_int4 = get_memory_usage(model_int4)
        return {
            "fp32_memory": mem_fp32,
            "int4_memory": mem_int4,
            "reduction": (1 - mem_int4/mem_fp32) * 100
        }

    def evaluate_quantized_model(self, test_dataset):
        evaluator = ModelEvaluator(self.model)
        metrics = evaluator.evaluate(test_dataset)
        return metrics
```

## GPTQ Quantization
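
GPTQ is a one-shot post-training weight quantization method: it works layer by layer, using approximate second-order (Hessian) information gathered from a small calibration set to pick quantized weights that minimize each layer's output error. Unlike the load-time bitsandbytes path above, quantization runs once offline, and the quantized checkpoint is saved and reloaded for inference.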
### Basic Implementation
```python
class GPTQQuantizer:
    def __init__(self, model_name_or_path: str):
        self.model_name_or_path = model_name_or_path

    def quantize_gptq(self, bits: int = 4,
                      group_size: int = 128,
                      examples: list = None):
        try:
            from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
        except ImportError:
            raise ImportError("Install auto-gptq: pip install auto-gptq")
        quantize_config = BaseQuantizeConfig(
            bits=bits,
            group_size=group_size,  # per-group quantization granularity
            damp_percent=0.01,
            desc_act=False
        )
        model = AutoGPTQForCausalLM.from_pretrained(
            self.model_name_or_path,
            quantize_config=quantize_config
        )
        # GPTQ needs tokenized calibration examples to run the actual quantization
        if examples is not None:
            model.quantize(examples)
        return model

    def save_quantized_model(self, model, save_path: str):
        model.save_quantized(save_path)
```
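
The `examples` argument must contain tokenized calibration samples; with auto-gptq these are dicts holding `input_ids` and `attention_mask`. A sketch of the full flow (checkpoint name and texts are placeholders):

```python
from transformers import AutoTokenizer

model_path = "facebook/opt-125m"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
texts = ["Quantization reduces model size.", "GPTQ uses calibration data."]
examples = [tokenizer(t, return_tensors="pt") for t in texts]

quantizer = GPTQQuantizer(model_path)
model_q = quantizer.quantize_gptq(bits=4, group_size=128, examples=examples)
quantizer.save_quantized_model(model_q, "opt-125m-gptq-4bit")
```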
### Applying GPTQ Quantization

```python
class GPTQQuantizationApp:
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.quantizer = GPTQQuantizer(model_path)

    def quantize_model(self, bits: int = 4,
                       group_size: int = 128,
                       examples: list = None):
        model_quantized = self.quantizer.quantize_gptq(bits, group_size, examples)
        return model_quantized

    def evaluate_quantized_model(self, quantized_model, test_dataset):
        evaluator = ModelEvaluator(quantized_model)
        metrics = evaluator.evaluate(test_dataset)
        return metrics

    def compare_quantization_methods(self, test_dataset, examples: list = None):
        # grid search over bit width and group size
        results = {}
        for bits in [4, 8]:
            for group_size in [64, 128]:
                model_quantized = self.quantize_model(bits, group_size, examples)
                metrics = self.evaluate_quantized_model(
                    model_quantized,
                    test_dataset
                )
                results[f"{bits}bit_{group_size}"] = metrics
        return results
```

## Quantization Comparison
### Performance Comparison
```python
class QuantizationComparator:
    def __init__(self, model_path: str):
        self.model_path = model_path

    def compare_all_methods(self, test_dataset):
        results = {}
        model_fp32 = self._load_fp32_model()
        results["fp32"] = self._evaluate_model(model_fp32, test_dataset)
        model_int8 = self._quantize_dynamic(model_fp32)
        results["int8_dynamic"] = self._evaluate_model(model_int8, test_dataset)
        model_int4 = self._quantize_nf4()
        results["int4_nf4"] = self._evaluate_model(model_int4, test_dataset)
        model_gptq = self._quantize_gptq()
        results["gptq_4bit"] = self._evaluate_model(model_gptq, test_dataset)
        return results

    def _load_fp32_model(self):
        from transformers import AutoModelForCausalLM
        model = AutoModelForCausalLM.from_pretrained(self.model_path)
        model.eval()
        return model

    def _quantize_dynamic(self, model):
        quantizer = DynamicQuantizer(model)
        return quantizer.quantize_to_int8()

    def _quantize_nf4(self):
        # NF4 quantization reloads from the checkpoint, so it takes a path
        quantizer = NF4Quantizer(self.model_path)
        return quantizer.quantize_to_nf4()

    def _quantize_gptq(self):
        quantizer = GPTQQuantizer(self.model_path)
        return quantizer.quantize_gptq()

    def _evaluate_model(self, model, test_dataset):
        evaluator = ModelEvaluator(model)
        metrics = evaluator.evaluate(test_dataset)
        memory_usage = get_memory_usage(model)
        model_size = get_model_size(model)
        return {
            **metrics,
            "memory_usage": memory_usage,
            "model_size": model_size
        }
```
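
The trade-offs covered in this lesson, side by side:

| Method | Bits | Calibration data | When quantization happens | Tooling |
| --- | --- | --- | --- | --- |
| Dynamic quantization | 8 | Not needed | Weights offline, activations at runtime | torch.quantization |
| Static quantization | 8 | Required | Offline, after calibration | torch.quantization |
| NF4 / FP4 | 4 | Not needed | At model load time | bitsandbytes + transformers |
| GPTQ | 4 (or 8) | Required | Offline, one-shot | auto-gptq |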
## Hands-on Exercises

### Exercise 1: Implement Dynamic Quantization
```python
import torch

def quantize_model_dynamically(model_path: str):
    from transformers import AutoModelForCausalLM
    model = AutoModelForCausalLM.from_pretrained(model_path)
    # replace Linear layers with dynamically quantized int8 equivalents
    model_int8 = torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear},
        dtype=torch.qint8
    )
    return model_int8
```
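
A quick way to confirm the exercise worked is to print the returned model and look for `DynamicQuantizedLinear` modules (checkpoint name is a placeholder):

```python
model_int8 = quantize_model_dynamically("facebook/opt-125m")  # placeholder checkpoint
print(model_int8)
```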
### Exercise 2: Implement NF4 Quantization

```python
def quantize_model_nf4(model_path: str):
    # NF4 is applied at load time via a bitsandbytes quantization config
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config
    )
    return model
```
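
To see the effect, `transformers` models expose `get_memory_footprint()`. Note that 4-bit loading requires a CUDA-capable environment with bitsandbytes installed (checkpoint name is a placeholder):

```python
model = quantize_model_nf4("facebook/opt-125m")  # placeholder checkpoint
print(f"{model.get_memory_footprint() / 1024**2:.1f} MB")
```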
## Summary

In this lesson we covered model quantization:

- Basic concepts and benefits of quantization
- Dynamic quantization
- Static quantization
- 4-bit quantization (NF4, FP4)
- GPTQ quantization

Quantization is a key technique for lowering the cost of model deployment.
