
Commit 433d1a7

Merge pull request #200 from sj-distributor/openai-audio-api

Implement OpenAI audio API

2 parents 340edd8 + 69ff542

File tree

14 files changed: +85 -55 lines changed


src/SmartTalk.Api/Startup.cs (-3)

@@ -1,5 +1,4 @@
 using Serilog;
-using OpenAI.Extensions;
 using SmartTalk.Messages;
 using Correlate.AspNetCore;
 using SmartTalk.Api.Filters;
@@ -39,8 +38,6 @@ public void ConfigureServices(IServiceCollection services)
         });

         services.AddHangfireInternal(Configuration);
-        services.AddOpenAIService(settings => { settings.ApiKey = new OpenAiSettings(Configuration).ApiKey; })
-            .ConfigureHttpClient(http => http.Timeout = TimeSpan.FromSeconds(6000));
     }

     public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
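With the Betalgo.OpenAI registration removed from Startup, the official OpenAI 2.x clients are constructed directly where they are needed, as the service changes later in this commit show. A minimal sketch of that pattern, assuming OpenAiSettings exposes an ApiKey string; the OpenAiClients helper below is illustrative only and not part of the commit:

    using OpenAI.Audio;
    using OpenAI.Chat;

    // Illustrative helper (not in the commit): the services simply construct the
    // clients inline with the API key read from OpenAiSettings.
    public class OpenAiClients
    {
        private readonly string _apiKey;

        public OpenAiClients(string apiKey) => _apiKey = apiKey;

        // Audio-capable chat model used for the recording analysis report.
        public ChatClient CreateAudioChatClient() => new("gpt-4o-audio-preview", _apiKey);

        // Whisper model used for speech-to-text transcription.
        public AudioClient CreateTranscriptionClient() => new("whisper-1", _apiKey);
    }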
Database migration scripts (3 new files)

@@ -0,0 +1 @@
+alter table `agent` add column `wechat_robot_url` varchar(256) null;

@@ -0,0 +1 @@
+ALTER TABLE `agent` CHANGE `wechat_robot_url` `wechat_robot_key` VARCHAR(255) null;

@@ -0,0 +1 @@
+alter table `ai_speech_assistant` add column `voice` varchar(36) null;

src/SmartTalk.Core/Domain/AISpeechAssistant/AiSpeechAssistant.cs (+3)

@@ -21,6 +21,9 @@ public class AiSpeechAssistant : IEntity, IHasCreatedFields
     [Column("url"), StringLength(512)]
     public string Url { get; set; }

+    [Column("voice"), StringLength(36)]
+    public string Voice { get; set; }
+
     [Column("provider")]
     public AiSpeechAssistantProvider Provider { get; set; }
src/SmartTalk.Core/Domain/System/Agent.cs (+3)

@@ -11,6 +11,9 @@ public class Agent : IEntity
     [Column("id")]
     [DatabaseGenerated(DatabaseGeneratedOption.Identity)]
     public int Id { get; set; }
+
+    [Column("wechat_robot_key"), StringLength(256)]
+    public string WechatRobotKey { get; set; }

     [Column("relate_id")]
     public int RelateId { get; set; }

src/SmartTalk.Core/Mappings/SttMapping.cs (-14)

This file was deleted.

src/SmartTalk.Core/Services/Agents/AgentDataProvider.cs (+8 -1)

@@ -11,6 +11,8 @@ namespace SmartTalk.Core.Services.Agents;
 public interface IAgentDataProvider : IScopedDependency
 {
     Task<Agent> GetAgentAsync(int? id = null, AgentType? type = null, int? relateId = null, string name = null, CancellationToken cancellationToken = default);
+
+    Task<Agent> GetAgentByIdAsync(int id, CancellationToken cancellationToken);

     Task<List<Agent>> GetAgentsAsync(AgentType type, CancellationToken cancellationToken);
 }
@@ -47,7 +49,12 @@ join restaurant in _repository.Query<Restaurant>() on agent.RelateId equals rest

         return query == null ? null : await query.FirstOrDefaultAsync(cancellationToken).ConfigureAwait(false);
     }
-
+
+    public async Task<Agent> GetAgentByIdAsync(int id, CancellationToken cancellationToken)
+    {
+        return await _repository.Query<Agent>().Where(x => x.Id == id).FirstOrDefaultAsync(cancellationToken).ConfigureAwait(false);
+    }
+
     public async Task<List<Agent>> GetAgentsAsync(AgentType type, CancellationToken cancellationToken)
     {
         return await _repository.Query<Agent>().Where(x => x.Type == type).ToListAsync(cancellationToken).ConfigureAwait(false);

src/SmartTalk.Core/Services/AiSpeechAssistant/AiSpeechAssistantService.cs (+39 -2)

@@ -12,6 +12,9 @@
 using AutoMapper;
 using SmartTalk.Core.Constants;
 using Microsoft.AspNetCore.Http;
+using OpenAI.Chat;
+using SmartTalk.Core.Services.Agents;
+using SmartTalk.Core.Services.Http;
 using SmartTalk.Messages.Constants;
 using SmartTalk.Core.Services.Jobs;
 using SmartTalk.Core.Services.PhoneOrder;
@@ -26,6 +29,7 @@
 using SmartTalk.Messages.Events.AiSpeechAssistant;
 using SmartTalk.Messages.Commands.AiSpeechAssistant;
 using SmartTalk.Messages.Commands.PhoneOrder;
+using SmartTalk.Messages.Dto.Agent;
 using SmartTalk.Messages.Enums.PhoneOrder;
 using JsonSerializer = System.Text.Json.JsonSerializer;
 using RecordingResource = Twilio.Rest.Api.V2010.Account.Call.RecordingResource;
@@ -53,6 +57,9 @@ public class AiSpeechAssistantService : IAiSpeechAssistantService
     private readonly OpenAiSettings _openAiSettings;
     private readonly TwilioSettings _twilioSettings;
     private readonly ZhiPuAiSettings _zhiPuAiSettings;
+    private readonly IAgentDataProvider _agentDataProvider;
+    private readonly IPhoneOrderService _phoneOrderService;
+    private readonly ISmartTalkHttpClientFactory _httpClientFactory;
     private readonly IPhoneOrderDataProvider _phoneOrderDataProvider;
     private readonly ISmartTalkBackgroundJobClient _backgroundJobClient;
     private readonly IAiSpeechAssistantDataProvider _aiSpeechAssistantDataProvider;
@@ -62,6 +69,9 @@ public AiSpeechAssistantService(
         OpenAiSettings openAiSettings,
         TwilioSettings twilioSettings,
         ZhiPuAiSettings zhiPuAiSettings,
+        IAgentDataProvider agentDataProvider,
+        IPhoneOrderService phoneOrderService,
+        ISmartTalkHttpClientFactory httpClientFactory,
         IPhoneOrderDataProvider phoneOrderDataProvider,
         ISmartTalkBackgroundJobClient backgroundJobClient,
         IAiSpeechAssistantDataProvider aiSpeechAssistantDataProvider)
@@ -70,6 +80,9 @@ public AiSpeechAssistantService(
         _openAiSettings = openAiSettings;
         _twilioSettings = twilioSettings;
         _zhiPuAiSettings = zhiPuAiSettings;
+        _phoneOrderService = phoneOrderService;
+        _agentDataProvider = agentDataProvider;
+        _httpClientFactory = httpClientFactory;
         _backgroundJobClient = backgroundJobClient;
         _phoneOrderDataProvider = phoneOrderDataProvider;
         _aiSpeechAssistantDataProvider = aiSpeechAssistantDataProvider;
@@ -146,6 +159,28 @@ public async Task ReceivePhoneRecordingStatusCallbackAsync(ReceivePhoneRecording

         record.Url = command.RecordingUrl;
         record.Status = PhoneOrderRecordStatus.Sent;
+
+        var agent = await _agentDataProvider.GetAgentByIdAsync(record.AgentId, cancellationToken: cancellationToken).ConfigureAwait(false);
+
+        ChatClient client = new("gpt-4o-audio-preview", _openAiSettings.ApiKey);
+        var audioFileRawBytes = await _httpClientFactory.GetAsync<byte[]>(record.Url, cancellationToken).ConfigureAwait(false);
+        var audioData = BinaryData.FromBytes(audioFileRawBytes);
+        List<ChatMessage> messages =
+        [
+            new SystemChatMessage("你是一名電話錄音的分析員,通過聽取錄音內容和語氣情緒作出精確分析,冩出一份分析報告。\n\n分析報告的格式:交談主題:xxx\n\n 內容摘要:xxx \n\n 客人情感與情緒: xxx \n\n 待辦事件: \n1.xxx\n2.xxx \n\n 客人下單內容(如果沒有則忽略):1. 牛肉(1箱)\n2.雞腿肉(1箱)"),
+            new UserChatMessage(ChatMessageContentPart.CreateInputAudioPart(audioData, ChatInputAudioFormat.Wav)),
+            new UserChatMessage("幫我根據錄音生成分析報告:")
+        ];
+
+        ChatCompletionOptions options = new() { ResponseModalities = ChatResponseModalities.Text };
+
+        ChatCompletion completion = await client.CompleteChatAsync(messages, options, cancellationToken);
+        Log.Information("sales record analyze report:" + completion.Content.FirstOrDefault()?.Text);
+        record.TranscriptionText = completion.Content.FirstOrDefault()?.Text;
+
+        if (!string.IsNullOrEmpty(agent.WechatRobotKey))
+            await _phoneOrderService.SendWorkWeChatRobotNotifyAsync(audioFileRawBytes, agent.WechatRobotKey, "錄音分析報告:\n" + record.TranscriptionText, cancellationToken).ConfigureAwait(false);
+
         await _phoneOrderDataProvider.UpdatePhoneOrderRecordsAsync(record, cancellationToken: cancellationToken).ConfigureAwait(false);
     }

@@ -200,7 +235,9 @@ private async Task<WebSocket> ConnectOpenAiRealTimeSocketAsync(Domain.AISpeechAs
         var url = string.IsNullOrEmpty(assistant.Url) ? AiSpeechAssistantStore.DefaultUrl : assistant.Url;

         await openAiWebSocket.ConnectAsync(new Uri(url), cancellationToken).ConfigureAwait(false);
+
         await SendSessionUpdateAsync(openAiWebSocket, assistant, prompt).ConfigureAwait(false);
+
         return openAiWebSocket;
     }

@@ -401,7 +438,7 @@ private async Task SendToTwilioAsync(WebSocket twilioWebSocket, WebSocket openAi
             }
         }

-        if (!context.InitialConversationSent)
+        if (!context.InitialConversationSent && !string.IsNullOrEmpty(context.Assistant.Greetings))
         {
             await SendInitialConversationItem(openAiWebSocket, context);
             context.InitialConversationSent = true;
@@ -697,7 +734,7 @@ private async Task SendSessionUpdateAsync(WebSocket openAiWebSocket, Domain.AISp
                 turn_detection = new { type = "server_vad" },
                 input_audio_format = "g711_ulaw",
                 output_audio_format = "g711_ulaw",
-                voice = "alloy",
+                voice = string.IsNullOrEmpty(assistant.Voice) ? "alloy" : assistant.Voice,
                 instructions = prompt,
                 modalities = new[] { "text", "audio" },
                 temperature = 0.8,
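The new callback logic downloads the Twilio recording, sends it to gpt-4o-audio-preview as an input-audio content part together with a Traditional Chinese system prompt (roughly: "you are a call-recording analyst; produce a report covering the conversation topic, a content summary, the caller's sentiment, to-do items, and any order items"), stores the text reply as the record's transcription, and forwards it to the agent's WeCom robot when a WechatRobotKey is configured. Below is a self-contained sketch of the same audio-input chat completion, assuming a local recording.wav and an OPENAI_API_KEY environment variable instead of the service's OpenAiSettings and HTTP client:

    using System.Linq;
    using OpenAI.Chat;

    // Assumptions for this sketch: the key comes from an environment variable and the
    // audio from a local file; the service reads OpenAiSettings.ApiKey and downloads
    // the bytes from the Twilio recording URL.
    var apiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY");
    ChatClient client = new("gpt-4o-audio-preview", apiKey);

    // Wrap the raw WAV bytes as an input-audio content part.
    var audioBytes = await File.ReadAllBytesAsync("recording.wav");
    var audioData = BinaryData.FromBytes(audioBytes);

    List<ChatMessage> messages =
    [
        new SystemChatMessage("You are a call-recording analyst. Write a report covering topic, summary, caller sentiment, to-dos, and any order items."),
        new UserChatMessage(ChatMessageContentPart.CreateInputAudioPart(audioData, ChatInputAudioFormat.Wav)),
        new UserChatMessage("Generate the analysis report from this recording:")
    ];

    // Audio goes in, but only a text answer needs to come back.
    ChatCompletionOptions options = new() { ResponseModalities = ChatResponseModalities.Text };

    ChatCompletion completion = await client.CompleteChatAsync(messages, options);
    Console.WriteLine(completion.Content.FirstOrDefault()?.Text);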
src/SmartTalk.Core/Services/STT/SpeechToTextService.cs

@@ -1,15 +1,14 @@
 using Serilog;
 using AutoMapper;
 using System.Text;
-using OpenAI.Interfaces;
+using OpenAI.Audio;
 using SmartTalk.Core.Ioc;
-using OpenAI.ObjectModels;
 using SmartTalk.Core.Extensions;
 using SmartTalk.Messages.Dto.STT;
 using SmartTalk.Messages.Enums.STT;
 using SmartTalk.Core.Services.Http;
 using SmartTalk.Core.Services.Ffmpeg;
-using OpenAI.ObjectModels.RequestModels;
+using SmartTalk.Core.Settings.OpenAi;

 namespace SmartTalk.Core.Services.STT;

@@ -24,14 +23,14 @@ public class SpeechToTextService : ISpeechToTextService
 {
     private readonly IMapper _mapper;
     private readonly IFfmpegService _ffmpegService;
-    private readonly IOpenAIService _openAiService;
+    private readonly OpenAiSettings _openAiSettings;
     private readonly ISmartiesHttpClientFactory _httpClientFactory;

-    public SpeechToTextService(IMapper mapper, IFfmpegService ffmpegService, IOpenAIService openAiService, ISmartiesHttpClientFactory httpClientFactory)
+    public SpeechToTextService(IMapper mapper, IFfmpegService ffmpegService, OpenAiSettings openAiSettings, ISmartiesHttpClientFactory httpClientFactory)
     {
         _mapper = mapper;
         _ffmpegService = ffmpegService;
-        _openAiService = openAiService;
+        _openAiSettings = openAiSettings;
         _httpClientFactory = httpClientFactory;
     }

@@ -51,52 +50,45 @@ public async Task<string> SpeechToTextAsync(
         {
             var transcriptionResponse = await TranscriptionAsync(audio, language, fileType, responseFormat, prompt, cancellationToken).ConfigureAwait(false);

-            transcriptionResult.Append(transcriptionResponse.Text);
+            transcriptionResult.Append(transcriptionResponse);
         }

         Log.Information("Transcription result {Transcription}", transcriptionResult.ToString());

         return transcriptionResult.ToString();
     }

-    public async Task<AudioTranscriptionResponseDto> TranscriptionAsync(
+    public async Task<string> TranscriptionAsync(
         byte[] file, TranscriptionLanguage? language, TranscriptionFileType fileType = TranscriptionFileType.Wav,
         TranscriptionResponseFormat responseFormat = TranscriptionResponseFormat.Vtt, string prompt = null, CancellationToken cancellationToken = default)
     {
+        AudioClient client = new("whisper-1", _openAiSettings.ApiKey);
+
         var filename = $"{Guid.NewGuid()}.{fileType.ToString().ToLower()}";

         var fileResponseFormat = responseFormat switch
         {
-            TranscriptionResponseFormat.Vtt => StaticValues.AudioStatics.ResponseFormat.Vtt,
-            TranscriptionResponseFormat.Srt => StaticValues.AudioStatics.ResponseFormat.Srt,
-            TranscriptionResponseFormat.Text => StaticValues.AudioStatics.ResponseFormat.Text,
-            TranscriptionResponseFormat.Json => StaticValues.AudioStatics.ResponseFormat.Json,
-            TranscriptionResponseFormat.VerboseJson => StaticValues.AudioStatics.ResponseFormat.Vtt
+            TranscriptionResponseFormat.Vtt => "vtt",
+            TranscriptionResponseFormat.Srt => "srt",
+            TranscriptionResponseFormat.Text => "text",
+            TranscriptionResponseFormat.Json => "json",
+            TranscriptionResponseFormat.VerboseJson => "verbose_json",
+            _ => "text"
         };
-
-        var transcriptionRequest = new AudioCreateTranscriptionRequest
+        var stream = new MemoryStream(file);
+
+        AudioTranscriptionOptions options = new()
         {
-            File = file,
-            FileName = filename,
-            Model = Models.WhisperV1,
-            ResponseFormat = fileResponseFormat
+            ResponseFormat = AudioTranscriptionFormat.Text,
+            Prompt = prompt
         };
-
-        if (language.HasValue) transcriptionRequest.Language = language.Value.GetDescription();

-        var response = await _httpClientFactory.SafelyProcessRequestAsync(nameof(SpeechToTextAsync), async () =>
-            await _openAiService.Audio.CreateTranscription(new AudioCreateTranscriptionRequest
-            {
-                File = file,
-                FileName = filename,
-                Model = Models.WhisperV1,
-                ResponseFormat = fileResponseFormat,
-                Language = language?.GetDescription(),
-                Prompt = prompt
-            }, cancellationToken).ConfigureAwait(false), cancellationToken).ConfigureAwait(false);
+        if (language.HasValue) options.Language = language.Value.GetDescription();
+
+        var response = await client.TranscribeAudioAsync(stream, "test.wav", options, cancellationToken);

         Log.Information("Transcription {FileName} response {@Response}", filename, response);

-        return _mapper.Map<AudioTranscriptionResponseDto>(response);
+        return response?.Value?.Text;
     }
 }
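SpeechToTextService now calls Whisper through the official SDK's AudioClient rather than Betalgo's IOpenAIService. A minimal sketch of that call, assuming a local audio.wav and the key in an OPENAI_API_KEY environment variable (the real method wraps the incoming byte[] in a MemoryStream, reads the key from OpenAiSettings, and maps the enum-based language and format parameters):

    using OpenAI.Audio;

    // Assumptions for this sketch: local file and environment-variable key.
    var apiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY");
    AudioClient client = new("whisper-1", apiKey);

    AudioTranscriptionOptions options = new()
    {
        // Plain text keeps the result easy to append into a combined transcript.
        ResponseFormat = AudioTranscriptionFormat.Text,
        Language = "zh",                     // optional language hint
        Prompt = "Restaurant phone order"    // optional context prompt
    };

    using var stream = File.OpenRead("audio.wav");
    AudioTranscription transcription = await client.TranscribeAudioAsync(stream, "audio.wav", options);

    Console.WriteLine(transcription.Text);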

src/SmartTalk.Core/SmartTalk.Core.csproj (+1 -1)

@@ -12,7 +12,6 @@
         <PackageReference Include="Autofac.Extensions.DependencyInjection" Version="8.0.0" />
         <PackageReference Include="AutoMapper" Version="12.0.1" />
         <PackageReference Include="AutoMapper.Contrib.Autofac.DependencyInjection" Version="7.1.0" />
-        <PackageReference Include="Betalgo.OpenAI" Version="8.5.1" />
         <PackageReference Include="dbup" Version="5.0.8" />
         <PackageReference Include="dbup-mysql" Version="5.0.10" />
         <PackageReference Include="Destructurama.JsonNet" Version="2.0.0" />
@@ -36,6 +35,7 @@
         <PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="8.0.0" />
         <PackageReference Include="Microsoft.KernelMemory.AI.OpenAI" Version="0.35.240321.1" />
         <PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
+        <PackageReference Include="OpenAI" Version="2.2.0-beta.2" />
         <PackageReference Include="OpenCCNET" Version="1.0.2" />
         <PackageReference Include="NRedisStack" Version="0.12.0" />
         <PackageReference Include="Pomelo.EntityFrameworkCore.MySql" Version="7.0.0" />

src/SmartTalk.IntegrationTests/TestBase.Initial.cs (-2)

@@ -4,7 +4,6 @@
 using MySql.Data.MySqlClient;
 using Newtonsoft.Json;
 using NSubstitute;
-using OpenAI.Interfaces;
 using Serilog;
 using SmartTalk.Core;
 using SmartTalk.Core.DbUpFile;
@@ -46,7 +45,6 @@ private void RegisterBaseContainer(ContainerBuilder containerBuilder)
             new SmartTalkModule(logger, configuration, typeof(SmartTalkModule).Assembly, typeof(TestBase).Assembly));

         containerBuilder.RegisterInstance(new TestCurrentUser()).As<ICurrentUser>();
-        containerBuilder.RegisterInstance(Substitute.For<IOpenAIService>()).AsImplementedInterfaces();
         containerBuilder.RegisterInstance(Substitute.For<IHttpContextAccessor>()).AsImplementedInterfaces();
         containerBuilder.RegisterInstance(Substitute.For<IAliYunOssService>()).AsImplementedInterfaces();
         containerBuilder.RegisterInstance(Substitute.For<IEasyPosClient>()).AsImplementedInterfaces();

src/SmartTalk.Messages/Dto/Agent/AgentDto.cs (+2)

@@ -6,6 +6,8 @@ public class AgentDto

     public int RelateId { get; set; }

+    public string WechatRobotKey { get; set; }
+
     public AgentType Type { get; set; }

     public DateTimeOffset CreatedDate { get; set; }

src/SmartTalk.Messages/Dto/AiSpeechAssistant/AiSpeechAssistantDto.cs (+2)

@@ -12,6 +12,8 @@ public class AiSpeechAssistantDto

     public string Url { get; set; }

+    public string Voice { get; set; }
+
     public AiSpeechAssistantProvider Provider { get; set; }

     public int AgentId { get; set; }
