How can I use gpu for inference? I already setup g...
# ask-for-help
How can I use the GPU for inference? I have already set up GPUs and CUDA via bentofile.yaml and the k8s deployment. Inference is working well, but checking with nvidia-smi, there is no process running on the GPU.
Copy code
import os
import sys
import bentoml
import mlflow
import boto3
import torch
import typing as t
from bentoml.io import Text, JSON  # fixed: was Slack-mangled into a URL and not valid Python
from datetime import datetime


# S3 credentials come from the environment; a KeyError here is a deliberate
# fast failure if the deployment forgot to inject them.
accessKey = os.environ['AWS_ACCESS_KEY_ID']
secretKey = os.environ['AWS_SECRET_ACCESS_KEY']
myRegion = 'ap-northeast-2'

start_time = datetime.now()

# Download the trained YOLOv5 weights from S3 into the working directory so
# the runnable below can load them from a plain local path.
# fixed: the destination was Slack-mangled to '<http://best.pt|best.pt>',
# which created a junk-named file that torch.hub.load('custom', path='best.pt')
# would never find.
client = boto3.client('s3', aws_access_key_id=accessKey,
                            aws_secret_access_key=secretKey,
                            region_name=myRegion)
client.download_file('bucket_name', 'weights/best_both_123.pt', 'best.pt')

class YoloV5Runnable(bentoml.Runnable):
    """BentoML runnable serving a custom YOLOv5 detector.

    SUPPORTED_RESOURCES must contain the literal resource name
    "nvidia.com/gpu" for BentoML to schedule this runnable onto a GPU
    worker. The previous value was Slack-mangled into a URL
    ("<http://nvidia.com/gpu|nvidia.com/gpu>"), so the runner silently
    fell back to CPU — which is why nvidia-smi showed no process even
    though inference worked.
    """

    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self):
        start_time = datetime.now()
        print("Torch Available : ", torch.cuda.is_available())
        # Prefer CUDA when the worker actually has it; otherwise run on CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load the custom weights downloaded from S3 at module import time.
        # fixed: path was Slack-mangled to '<http://best.pt|best.pt>'.
        self.model = torch.hub.load('ultralytics/yolov5', 'custom', path='best.pt', force_reload=True)
        # Move the model onto the selected device — without this the forward
        # pass stays on CPU even when CUDA is available.
        # fixed: the call itself was Slack-mangled ('<http://self.model.to|...>'),
        # which is a syntax error.
        self.model.to(self.device)
        self.model.conf = 0.85  # detection confidence threshold

        time_elapsed = datetime.now() - start_time
        print('load Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

    @bentoml.Runnable.method(batchable=False)
    def predict(self, img):
        """Run detection on *img* and return bounding boxes as JSON records."""
        return self.model(img).pandas().xyxy[0].to_json(orient="records")


# Wrap the runnable in a Runner and expose it through a BentoML service.
_runner = bentoml.Runner(YoloV5Runnable)
yolov5runner = t.cast("RunnerImpl", _runner)
svc = bentoml.Service("ric_service", runners=[yolov5runner])



@svc.api(input=Text(), output=JSON())
def inference(img):
    """Service endpoint: forward *img* to the YOLOv5 runner and return its JSON result."""
    return yolov5runner.predict.run(img)