OpenAI: Audio, Transcribe

Transcribes speech from an audio file. The “whisper-1” model in the OpenAI API converts speech data into text data. If abbreviations and technical terms are set as a summary (PROMPT), more accurate data conversion will be achieved.

Configs for this Auto Step
AuthzConfU
U: Select HTTP_Authz Setting (Secret API Key as “Fixed Value”) *
SelectConfA1
A1: Select FILE for Audio *
StrConfA2
A2: Set Request Summary PROMPT#{EL}
StrConfB1
B1: Set Sampling Temperature (default “0”)#{EL}
StrConfB2
B2: Set Language (default null)#{EL}
SelectConfC1
C1: Select STRING that stores Transcribed data (update)
SelectConfC2
C2: Select STRING that stores Transcribed data with LF (update)
SelectConfD1
D1: To store the whole Response JSON, Select STRING (update)
SelectConfD2
D2: To store the Audio Language, Select STRING (update)
SelectConfD3
D3: To store the Audio Duration, Select DECIMAL|STRING (update)
SelectConfD4
D4: To store the number of Audio Segments, Select DECIMAL|STRING (update)
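
As an illustration, a configuration might look like the following (data item names such as "q_audio" are placeholders, not part of the addon):

U:  OpenAI Secret API Key (HTTP Authz Setting, Token Fixed Value)
A1: q_audio (FILE)
A2: "A story about an old man and an old woman going to mow grass."
B1: 0
B2: ja
C1: q_transcript (STRING, update)
C2: q_transcript_lf (STRING, update)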
Script
// GraalJS Script (engine type: 2)

//////// START "main()" /////////////////////////////////////////////////////////////////

main();
function main(){ 

////// == Config Retrieving / 工程コンフィグの参照 ==
const strAuthzSetting   = configs.get      ( "AuthzConfU" );   /// REQUIRED
  engine.log( " AutomatedTask Config: Authz Setting: " + strAuthzSetting );
const filesPocketAudio  = configs.getObject( "SelectConfA1" ); /// REQUIRED
  let filesAudio        = engine.findData( filesPocketAudio );
  if( filesAudio      === null ) {
    throw new Error( "\n AutomatedTask UnexpectedFileError:" +
                     " No File {A1} is attached \n" );
  }else{ // java.util.ArrayList of QfileView
    engine.log( " AutomatedTask FilesArray {A1}: " +
                 filesAudio.size() + " file(s)" );
  }
const strPrompt         = configs.get      ( "StrConfA2" );    // NotRequired
const strTemperature    = configs.get      ( "StrConfB1" );    // NotRequired
const strLanguage       = configs.get      ( "StrConfB2" );    // NotRequired
const strPocketText     = configs.getObject( "SelectConfC1" ); // NotRequired
const strPocketLfText   = configs.getObject( "SelectConfC2" ); // NotRequired
const strPocketJson     = configs.getObject( "SelectConfD1" ); // NotRequired
const strPocketLang     = configs.getObject( "SelectConfD2" ); // NotRequired
const numstrPocketDuration = configs.getObject( "SelectConfD3" ); // NotRequired
const numstrPocketSegments = configs.getObject( "SelectConfD4" ); // NotRequired



////// == Data Retrieving / ワークフローデータの参照 ==
// (Nothing. Retrieved via Expression Language in Config Retrieving)


////// == Calculating / 演算 ==
//// OpenAI API > Documentation > API REFERENCE > AUDIO
//// https://platform.openai.com/docs/api-reference/audio

/// prepare request1
let request1Uri = "https://api.openai.com/v1/audio/transcriptions";
let request1 = httpClient.begin(); // HttpRequestWrapper
    request1 = request1.authSetting( strAuthzSetting ); // with "Authorization: Bearer XX"
    request1 = request1.multipart( "file", filesAudio.get(0) );
    request1 = request1.multipart( "model", "whisper-1" );
    request1 = request1.multipart( "response_format", "verbose_json" ); // "vtt" to WebVTT
    if ( strPrompt !== "" ) {
      request1 = request1.multipart( "prompt",      strPrompt );
    }
    if ( strTemperature !== "" ) {
      request1 = request1.multipart( "temperature", strTemperature );
    }
    if ( strLanguage !== "" ) {
      request1 = request1.multipart( "language",    strLanguage  );
    }

/// try request1
engine.log( " AutomatedTask ApiRequest1 Start: " + request1Uri );
const response1     = request1.post( request1Uri ); // HttpResponseWrapper
const response1Code = response1.getStatusCode() + ""; // JavaNum to string
const response1Body = response1.getResponseAsString();
engine.log( " AutomatedTask ApiResponse1 Status: " + response1Code );
if( response1Code !== "200"){
  throw new Error( "\n AutomatedTask UnexpectedResponseError: " +
                    response1Code + "\n" + response1Body + "\n" );
}
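// (typical failures: 401 for an invalid API key; 413 when the audio exceeds the API size limit, see Notes)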

/// parse response1
/* engine.log( response1Body ); // debug
{
  "task":"transcribe",
  "language":"japanese",
  "duration":19.41,
  "segments":[{
    "id":0,
    "seek":0,
    "start":1.0,
    "end":10.0,
    "text":"むかしむかし、あるところに、お爺さんとお婆さんがありました。",
    "tokens":[33350,3703,2849, ,,,  ,1543],
    "temperature":0.0,
    "avg_logprob":-0.1584527131282922,
    "compression_ratio":0.5641025641025641,
    "no_speech_prob":0.0035369042307138443,
    "transient":false
  },{
    "id":1,
    "seek":0,
    "start":10.0,
    "end":17.0,
    "text":"お爺さんは山へ芝刈りに、お婆さんも山へ芝刈りに行きました。",
    "tokens":[6117,8164,118, ,,, ,1543],
    "temperature":0.0,
    "avg_logprob":-0.1584527131282922,
    "compression_ratio":0.5641025641025641,
    "no_speech_prob":0.0035369042307138443,
    "transient":false
  },{
    "id":2,
    "seek":1700,
    "start":17.0,
    "end":42.0,
    "text":"なんでやねん。",
    "tokens":[50364,11847,2474,7355,5555,3225,1543,51614],
    "temperature":0.0,
    "avg_logprob":-0.5128163761562772,
    "compression_ratio":0.25,
    "no_speech_prob":0.04882895573973656,
    "transient":false
  }],
  "text":"むかしむかし、 ,,, さんも山へ芝刈りに行きました。 なんでやねん。"
}
*/
const response1Obj = JSON.parse( response1Body );


/// extract text
let arrLfText = [];
for ( let i = 0; i < response1Obj.segments.length; i++ ){
  arrLfText.push ( response1Obj.segments[i].text );
}
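// arrLfText now holds one line of text per segment; it is joined with LF for {C2} below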


////// == Data Updating / ワークフローデータへの代入 ==

if( strPocketText !== null ){
  engine.setData( strPocketText,
                  response1Obj.text ?? ""
                );
}
if( strPocketLfText !== null ){
  engine.setData( strPocketLfText,
                  arrLfText.join( '\n' )
                );
}
if( strPocketJson !== null ){
  engine.setData( strPocketJson,
                  response1Body
                );
}
if( strPocketLang !== null ){
  engine.setData( strPocketLang,
                  response1Obj?.language ?? ""
                );
}
if( numstrPocketDuration !== null ){
  if( numstrPocketDuration.matchDataType( "STRING" ) ){
    engine.setData( numstrPocketDuration,
                    ( response1Obj?.duration ?? "" ) + ""
                  );
  } else {
    engine.setData( numstrPocketDuration, new java.math.BigDecimal(
                    response1Obj?.duration ?? 0
                  ));
  }
}
if( numstrPocketSegments !== null ){
  if( numstrPocketSegments.matchDataType( "STRING" ) ){
    engine.setData( numstrPocketSegments,
                    response1Obj.segments.length + ""
                  );
  } else {
    engine.setData( numstrPocketSegments, new java.math.BigDecimal(
                    response1Obj.segments.length
                  ));
  }
}


} //////// END "main()" /////////////////////////////////////////////////////////////////


/*
Notes:
- If you place this "Automated Step" in the Workflow diagram,
    - the request will be automatically sent every time the process token arrives.
    - A request is automatically sent to the OpenAI API server. (REST API)
    - The response from the OpenAI API server is automatically parsed.
    - You can incorporate "AI assistance" into your business processes.
- Audio File: The file stored in the selected FILE type data item is used as the audio source.
    - One audio source file should be stored.
    - The second and subsequent files are not referenced.
    - Supported audio formats
        - "mp3", "mp4", "mpeg", "mpga", "m4a", "wav", or "webm"
- PROMPT: Text to improve the quality of the generated transcripts
    - Words or acronyms that the model often misrecognizes in the audio
    - If the audio is split into multiple files, the transcribed text of the previous file
        - Only the final 224 tokens are considered
    - Example: setting the prompt "お爺さんとお婆さんが芝刈りに行く話です。"
        - "おじいさんは山へしばっかりに おばあさんも山へしばっかりに行きました。"
        - => "お爺さんは山へ芝刈りに、お婆さんも山へ芝刈りに行きました。"
- An API key is required to use the OpenAI API.
    - Get an API Key in advance.
    - Set "Secret API Key" to "HTTP Authz Setting" (Token Fixed Value)

APPENDIX:
- Sampling temperature
    - range: "[0,1]", default: "0"
- Language
    - ISO 639-1
    - eg. "`en`", "`ja`", "`fr`", "`de`", "`pt`", "`es`", "`ko`", "`nl`" , "`zh`",,,
    - https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
- Headers for developers belonging to multiple organizations are not yet supported (as of 202303).
    - `OpenAI-Organization`
- Supported subtitle and closed caption files (YouTube)
    - "`.srt`", "`.vtt`", etc
    - https://support.google.com/youtube/answer/2734698?hl=en


*/

Download

Warning: Freely modifiable JavaScript (ECMAScript) code. No warranty of any kind.
(Installing Addon Auto-Steps is available only on the Professional edition.)

Notes

  • If you place this automated step in the Workflow diagram,
    • the request will be automatically sent every time the process token arrives.
    • A request is automatically sent to the OpenAI API server. (REST API)
    • The response from the OpenAI API server is automatically parsed.
    • You can incorporate “AI assistance” into your business processes.
  • Audio File: The file stored in the selected FILE type data item is used as the audio source.
    • One audio source file should be stored.
    • The second and subsequent files are not referenced.
    • Supported audio formats
      • “mp3”, “mp4”, “mpeg”, “mpga”, “m4a”, “wav”, or “webm”
      • Video files (mp4, etc.) are also supported.
      • Consider the size limit on the API side (see the sketch after this list).
        • Status 413: “Maximum content size limit (26214400)” (about 26MB) (as of 202303)
      • Note: Upload in Questetra BPM Suite is limited to 100MB (as of 202303)
  • PROMPT: Text to improve the quality of the generated transcripts
    • Words or acronyms that the model often misrecognizes in the audio
    • If the audio is split into multiple files, the transcribed text of the previous file
      • Only the final 224 tokens are considered
  • An API key is required to use the OpenAI API.
    • Get an API Key in advance.
    • Set “Secret API Key” to “HTTP Authz Setting” (Token Fixed Value)
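
As a rough guard against the 413 error noted above, a file-size check could be added before the request is sent. A minimal sketch, assuming the Questetra file object exposes a getLength() accessor returning the size in bytes (verify against the actual QfileView API):

// Hypothetical pre-check, to be placed before "try request1" in the script above
const API_SIZE_LIMIT = 26214400; // bytes; OpenAI API limit as of 202303
if ( filesAudio.get(0).getLength() > API_SIZE_LIMIT ) { // getLength() is assumed here
  throw new Error( "\n AutomatedTask FileSizeError:" +
                   " Audio file {A1} exceeds the API size limit (" + API_SIZE_LIMIT + " bytes) \n" );
}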


Appendix
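
The verbose_json response carries per-segment start/end timestamps, so the transcript can be reshaped into subtitle formats such as SRT (see the subtitle note in the script comments). A minimal illustrative sketch in the same GraalJS style; the toSrtTimestamp() helper is hypothetical, not part of the addon:

// Illustrative: convert verbose_json segments into SubRip (SRT) text
function toSrtTimestamp( sec ){ // e.g. 19.41 -> "00:00:19,410"
  const ms  = Math.round( sec * 1000 );
  const pad = ( n, w ) => ( "000" + n ).slice( -w );
  return pad( Math.floor( ms / 3600000 ), 2 ) + ":" +
         pad( Math.floor( ms % 3600000 / 60000 ), 2 ) + ":" +
         pad( Math.floor( ms % 60000 / 1000 ), 2 ) + "," +
         pad( ms % 1000, 3 );
}
let arrSrt = [];
for ( let i = 0; i < response1Obj.segments.length; i++ ){
  const seg = response1Obj.segments[i];
  arrSrt.push( ( i + 1 ) + "\n" +
               toSrtTimestamp( seg.start ) + " --> " + toSrtTimestamp( seg.end ) + "\n" +
               seg.text + "\n" );
}
const strSrt = arrSrt.join( "\n" ); // could be stored via an additional STRING data item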
