返回博客
计算机视觉
阅读时长 7 分钟
计算机视觉入门:核心概念与应用
计算机视觉基础的综合指南,包括图像处理、特征检测和现代神经网络。
计算机视觉基础
计算机视觉是人工智能领域中最激动人心的方向之一,它使计算机能够从图像或视频中获取高层次理解。
图像基础
数字图像表示
数字图像本质上是二维数组,每个元素称为像素。对于灰度图像,每个像素是一个 0-255 的整数值;对于彩色图像,通常使用 RGB 三个通道:
import numpy as np
from PIL import Image

# Load the image from disk (PIL infers the format from the file contents).
img = Image.open('image.jpg')
# Convert to a NumPy array: shape is (height, width, channels) for RGB
# images, (height, width) for grayscale; pixel values are uint8 in [0, 255].
img_array = np.array(img)
print(f"图像形状: {img_array.shape}") # (height, width, channels)
print(f"数据类型: {img_array.dtype}") # uint8
print(f"像素值范围: {img_array.min()} - {img_array.max()}")
基本图像操作
使用 OpenCV 进行基本的图像处理:
import cv2
import numpy as np

# Read the image; OpenCV loads color images in BGR channel order, not RGB.
img = cv2.imread('image.jpg')
# Convert to a single-channel grayscale image.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Gaussian blur with a 5x5 kernel; sigma=0 lets OpenCV derive it from the
# kernel size. NOTE: `blurred` is computed for illustration but not shown below.
blurred = cv2.GaussianBlur(img, (5, 5), 0)
# Canny edge detection with hysteresis thresholds 100 (low) and 200 (high).
edges = cv2.Canny(gray, 100, 200)
# Display the results in GUI windows until any key is pressed.
cv2.imshow('Original', img)
cv2.imshow('Edges', edges)
cv2.waitKey(0)
cv2.destroyAllWindows()
特征检测
Harris 角点检测
角点是图像中梯度变化剧烈的点,是图像配准和跟踪的重要特征:
def detect_harris_corners(img, threshold=0.01):
    """Detect Harris corners and paint them red on the image.

    Args:
        img: BGR image as a (H, W, 3) uint8 array. NOTE: modified in
            place — detected corner pixels are overwritten with red.
        threshold: fraction of the maximum corner response above which a
            pixel counts as a corner (default 0.01, the previous
            hard-coded value).

    Returns:
        The same array, with corner pixels set to (0, 0, 255) (red in BGR).
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # cornerHarris requires a float32 input image.
    gray = np.float32(gray)
    # Harris response map: 2-pixel neighbourhood, 3x3 Sobel aperture, k=0.04.
    corners = cv2.cornerHarris(gray, blockSize=2, ksize=3, k=0.04)
    # Dilate the response so each corner covers a few pixels when drawn.
    corners = cv2.dilate(corners, None)
    img[corners > threshold * corners.max()] = [0, 0, 255]
    return img
ORB 特征描述符
ORB(Oriented FAST and Rotated BRIEF)是一种快速且具有旋转不变性的特征描述符:
def detect_orb_features(img, nfeatures=1000):
    """Detect ORB keypoints and compute their binary descriptors.

    ORB (Oriented FAST and Rotated BRIEF) is fast and rotation-invariant.

    Args:
        img: input image (BGR or grayscale ndarray).
        nfeatures: maximum number of keypoints to retain (default 1000,
            the previous hard-coded value).

    Returns:
        Tuple of (image with keypoints drawn in green, keypoints,
        descriptors). `descriptors` is None if no keypoints were found.
    """
    orb = cv2.ORB_create(nfeatures=nfeatures)
    # Detect keypoints and compute descriptors in one pass (no mask).
    keypoints, descriptors = orb.detectAndCompute(img, None)
    # Draw the keypoints on a copy of the input for visualization.
    img_with_keypoints = cv2.drawKeypoints(
        img, keypoints, None, color=(0, 255, 0), flags=0
    )
    return img_with_keypoints, keypoints, descriptors
特征匹配
使用 FLANN 匹配器进行特征点匹配:
def match_features(desc1, desc2, img1, img2, kp1, kp2, ratio=0.7):
    """Match two descriptor sets with FLANN and draw the good matches.

    Args:
        desc1, desc2: descriptor arrays from the two images.
        img1, img2: the source images (used only for drawing).
        kp1, kp2: keypoint lists corresponding to the descriptors.
        ratio: Lowe's ratio-test threshold (default 0.7, as before).

    Returns:
        An image showing the two inputs side by side with the surviving
        matches drawn between them.
    """
    # NOTE(review): the KD-tree index assumes float descriptors (e.g. SIFT).
    # For binary descriptors such as ORB, use FLANN_INDEX_LSH instead, or
    # convert the descriptors to float32 first.
    FLANN_INDEX_KDTREE = 1
    index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    search_params = dict(checks=50)
    flann = cv2.FlannBasedMatcher(index_params, search_params)
    matches = flann.knnMatch(desc1, desc2, k=2)
    # Lowe's ratio test: keep a match only when its best distance is clearly
    # smaller than the second-best. knnMatch can return fewer than 2
    # neighbours for a query, so guard the length instead of unpacking
    # blindly (the original `for m, n in matches` raised ValueError then).
    good_matches = []
    for pair in matches:
        if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance:
            good_matches.append(pair[0])
    # Draw the surviving matches between the two images.
    result = cv2.drawMatches(img1, kp1, img2, kp2, good_matches, None, flags=2)
    return result
深度学习时代
现代计算机视觉主要基于卷积神经网络(CNN)和 Vision Transformer。
CNN 基础
卷积神经网络通过卷积层、池化层和全连接层提取图像特征:
import torch
import torch.nn as nn
import torch.nn.functional as F
class SimpleCNN(nn.Module):
    """A small three-block CNN classifier for 32x32 RGB inputs (e.g. CIFAR-10).

    Three conv+pool stages shrink 32x32 -> 4x4 spatially while growing the
    channels 3 -> 32 -> 64 -> 128, followed by a two-layer classifier head
    with dropout.

    Args:
        num_classes: size of the output logit vector (default 10).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Feature extractor: padding=1 keeps spatial size through each conv;
        # the shared 2x2 max-pool halves it after each stage (32 -> 16 -> 8 -> 4).
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Classifier head (input size fixed by the 128 x 4 x 4 feature map).
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, num_classes)
        # Dropout regularizes the head (active only in training mode).
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # flatten(x, 1) preserves the batch dimension and raises on a size
        # mismatch, unlike view(-1, N) which can silently regroup batches
        # when the spatial size is not the expected 4x4.
        x = torch.flatten(x, 1)
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)
数据增强
数据增强可以扩充训练数据集,提高模型泛化能力:
from torchvision import transforms

# Training-time pipeline: random crops, flips, rotations and colour jitter
# expand the effective dataset; normalization uses the ImageNet channel
# statistics expected by torchvision's pretrained models.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Validation pipeline: deterministic resize + centre crop only — no random
# augmentation, but the same normalization so train/val inputs share one
# distribution.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
迁移学习
使用预训练模型可以显著提高性能并减少训练时间:
import torchvision.models as models

# Load ResNet-50 with ImageNet-pretrained weights.
# NOTE(review): `pretrained=True` is deprecated in recent torchvision; the
# current spelling is `weights=models.ResNet50_Weights.DEFAULT` — confirm
# against the installed version.
model = models.resnet50(pretrained=True)
# Freeze the entire backbone so only the new head is trained.
for param in model.parameters():
    param.requires_grad = False
# Replace the final fully-connected layer with one sized for our task.
# (`num_classes` must be defined by the surrounding training script.)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
# Optimize only the parameters of the freshly added head.
optimizer = torch.optim.Adam(model.fc.parameters(), lr=0.001)
实际应用
目标检测
使用 YOLO 进行实时目标检测:
import torch

# Load a pretrained YOLOv5-small model via torch.hub (downloads the repo and
# weights on first run — requires network access; cached afterwards).
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
# Run inference; here the input is a file path.
img = 'image.jpg'
results = model(img)
# Display the annotated detections and save annotated copies to disk.
results.show()
results.save()
图像分割
使用 U-Net 进行语义分割:
class UNet(nn.Module):
    """U-Net for semantic segmentation.

    Four-stage encoder/decoder with skip connections. The original version
    stopped after two decoder stages, so its output was 1/4 of the input
    resolution and the `enc1`/`enc2` skip features were never used; two more
    up-sampling stages are added here so the output spatial size matches the
    input. Input height and width must be divisible by 16.

    Args:
        in_channels: channels of the input image (default 3, RGB).
        out_channels: channels of the output map (default 1, mask logits).
    """

    def __init__(self, in_channels=3, out_channels=1):
        super().__init__()
        # Encoder: channel count doubles at each stage.
        self.enc1 = self.conv_block(in_channels, 64)
        self.enc2 = self.conv_block(64, 128)
        self.enc3 = self.conv_block(128, 256)
        self.enc4 = self.conv_block(256, 512)
        # Bottleneck at 1/16 resolution.
        self.bottleneck = self.conv_block(512, 1024)
        # Decoder: each transposed conv doubles resolution, then a conv
        # block fuses the upsampled features with the matching encoder skip
        # (hence the doubled input channels on each dec block).
        self.up1 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec1 = self.conv_block(1024, 512)
        self.up2 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec2 = self.conv_block(512, 256)
        self.up3 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec3 = self.conv_block(256, 128)
        self.up4 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec4 = self.conv_block(128, 64)
        # 1x1 conv maps the final feature maps to per-pixel logits.
        self.final = nn.Conv2d(64, out_channels, 1)

    def conv_block(self, in_channels, out_channels):
        """Two 3x3 convolutions (padding=1), each followed by ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.ReLU()
        )

    def forward(self, x):
        """Return segmentation logits with the same H and W as the input."""
        # Encoder path (max-pool halves resolution between stages).
        enc1 = self.enc1(x)
        enc2 = self.enc2(F.max_pool2d(enc1, 2))
        enc3 = self.enc3(F.max_pool2d(enc2, 2))
        enc4 = self.enc4(F.max_pool2d(enc3, 2))
        bottleneck = self.bottleneck(F.max_pool2d(enc4, 2))
        # Decoder path: upsample, concatenate the matching skip, fuse.
        dec1 = self.dec1(torch.cat([self.up1(bottleneck), enc4], dim=1))
        dec2 = self.dec2(torch.cat([self.up2(dec1), enc3], dim=1))
        dec3 = self.dec3(torch.cat([self.up3(dec2), enc2], dim=1))
        dec4 = self.dec4(torch.cat([self.up4(dec3), enc1], dim=1))
        return self.final(dec4)
总结
计算机视觉是一个快速发展的领域,从传统的图像处理技术到现代的深度学习方法,已经产生了许多强大的应用。
关键要点:
- 掌握图像基础操作和 OpenCV
- 理解特征检测和匹配算法
- 学习 CNN 和 Vision Transformer
- 实践迁移学习和数据增强
- 关注实时目标检测和图像分割等应用
分享这篇文章