2つのTSV文字列, 共通列で結合

2つのTSV文字列, 共通列で結合 (Two TSV Strings, Join on Common Column)

共通のキーFieldに基づいて2つのTSVをマージします。TsvA（左テーブル）の全行を保持し、TsvB（右テーブル）から一致する行を取り込みます〔左外部結合〕。どちらのTSVも事前ソートは不要ですが、TsvBのより上位にある行がマッチング判定されます。

Configs

A1: 存続するTsvAをセットしてください *^#{EL}
A2: TsvAの共通列IDをセットしてください (例 “0” ) *^#{EL}
B1: 吸収されるTsvBをセットしてください *^#{EL}
B2: TsvBの共通列IDをセットしてください (例 “2” ) *^#{EL}
B3: TsvBの結合列IDをセットしてください (例 “0,1,3” )^#{EL}
B4: 代替文字をセットしてください (例 “–,0,NA” )^#{EL}
C1: マージされたTSVが格納される文字列型データ項目を選択してください (更新) *

Script (click to open)

// GraalJS Script (engine type: 2)

//////// START "main()" /////////////////////////////////////////////////////////////////
main();
function main(){ 

//// == Config Retrieving / 工程コンフィグの参照 ==
const strLeftTsv       = configs.get( "StrConfA1" );          /// REQUIRED //////////////
  if( strLeftTsv     === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A1: LeftTsv} is empty \n" );
  }
  const arr2dLeftTsv   = parseAsRectangular( strLeftTsv );
const strLeftTsvKey    = configs.get( "StrConfA2" );          /// REQUIRED //////////////
  if( strLeftTsvKey  === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A2: LeftTsvKey} is empty \n" );
  }
  const numLeftTsvKey  = parseInt( strLeftTsvKey, 10 );
  if( isNaN( numLeftTsvKey ) || numLeftTsvKey < 0 ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A2: LeftTsvKey} must be a positive integer \n" );
  }
  if( numLeftTsvKey   >= arr2dLeftTsv[0].length ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A2: LeftTsvKey} is larger than LeftTsvWidth \n" );
  }

const strRightTsv      = configs.get( "StrConfB1" );          /// REQUIRED //////////////
  if( strRightTsv    === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B1: RightTsv} is empty \n" );
  }
  const arr2dRightTsv  = parseAsRectangular( strRightTsv );
const strRightTsvKey   = configs.get( "StrConfB2" );          /// REQUIRED //////////////
  if( strRightTsvKey === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B2: RightTsvKey} is empty \n" );
  }
  const numRightTsvKey = parseInt( strRightTsvKey, 10 );
  if( isNaN( numRightTsvKey ) || numRightTsvKey < 0 ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B2: RightTsvKey} must be a positive integer \n" );
  }
  if( numRightTsvKey   >= arr2dRightTsv[0].length ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B2: RightTsvKey} is larger than RightTsvWidth \n" );
  }

let   strJoinIds       = configs.get( "StrConfB3" );          // NotRequired ////////////
  if( strJoinIds     === "" ){
    for( let i = 0; i < arr2dRightTsv[0].length; i++ ){
      strJoinIds += i + "";
      if( i !== arr2dRightTsv[0].length - 1 ){ strJoinIds += ","; }
    }
  }
  let arrJoinIds       = strJoinIds.split(",");

let   strJoinStrings   = configs.get( "StrConfB4" );          // NotRequired ////////////
  if( strJoinStrings === "" ){
    for( let i = 0; i < arr2dRightTsv[0].length; i++ ){
      strJoinStrings += "";
      if( i !== arr2dRightTsv[0].length - 1 ){ strJoinIds += ","; }
    }
  }
  let arrJoinStrings   = strJoinStrings.split(",");


const strPocketOutput  = configs.getObject( "SelectConfC1" ); /// REQUIRED //////////////


//// == Data Retrieving / ワークフローデータの参照 ==
// (Nothing. Retrieved via Expression Language in Config Retrieving)


//// == Calculating / 演算 ==

// compare LeftTsv with RightTsv
let strOutput = "";
for( let i = 0; i < arr2dLeftTsv.length; i++ ){
  let boolMatchExists = false;
  for( let j = 0; j < arr2dRightTsv.length; j++ ){
    if( arr2dLeftTsv[i][numLeftTsvKey] === arr2dRightTsv[j][numRightTsvKey] ){
      boolMatchExists = true;
      strOutput += arr2dLeftTsv[i].join("\t") + "\t";
      for( let k = 0; k < arrJoinIds.length; k++ ){
        strOutput += arr2dRightTsv[j][ parseInt(arrJoinIds[k]) ];
        if( k !== arrJoinIds.length - 1 ){
          strOutput += "\t";
        }
      }
      strOutput += "\n";
      break;
    }
  }
  if( ! boolMatchExists ){
    strOutput += arr2dLeftTsv[i].join("\t") + "\t";
    for( let k = 0; k < arrJoinIds.length; k++ ){
      if( k < arrJoinStrings.length ){
        strOutput += arrJoinStrings[k];
      }else{
        strOutput += arrJoinStrings[arrJoinStrings.length - 1];
      }
      if( k !== arrJoinIds.length - 1 ){
        strOutput += "\t";
      }
    }
    strOutput += "\n";
  }
}
strOutput = strOutput.slice( 0, -1 ); // delete last "\n"


//// == Data Updating / ワークフローデータへの代入 ==
engine.setData( strPocketOutput, strOutput );


} //////// END "main()" /////////////////////////////////////////////////////////////////


// Parses TSV string as two-dimensional rectangular data matrix and creates a 2D array.
function parseAsRectangular( strTsv ){
  const arrTsv = strTsv.split("\n");

  /// Get numMinWidth and numMaxWidth (blank lines are excluded)
  let numMinWidth   = Infinity; // cf. String-Type Max: 1 million
  let numMaxWidth   = 0;
  let numBlanklines = 0;
  for( let i = 0; i < arrTsv.length; i++ ){
    if( arrTsv[i] === "" ){ // Skip blank lines
      numBlanklines += 1;
      continue;
    }
    let arrCells = arrTsv[i].split("\t");
    if( numMinWidth > arrCells.length ){ numMinWidth = arrCells.length; }
    if( numMaxWidth < arrCells.length ){ numMaxWidth = arrCells.length; }
  }
  engine.log( " AutomatedTask TsvDataCheck:" + 
              " MinWidth:" + numMinWidth +
              " MaxWidth:" + numMaxWidth +
              " Lines:" + arrTsv.length +
              " (BlankLines:" + numBlanklines + ")" );

  /// Get numMinWidth and numMaxWidth (blank lines are excluded)
  let arr2dTsv      = [];
  for( let i = 0; i < arrTsv.length; i++ ){
    if( arrTsv[i] === "" ){ // Skip blank lines
      continue;
    }
    let arrTmp = [];
    let arrCells = arrTsv[i].split("\t");
    for( let j = 0; j < numMaxWidth; j++ ){
      if( j < arrCells.length ){
        arrTmp[j] = arrCells[j];
      }else{
        arrTmp[j] = "";
      }
    }
    arr2dTsv.push( arrTmp );
  }

  return arr2dTsv;
}

/*
Notes:
- When the process reaches this automated task, TsvA and TsvB are automatically combined.
    - They are combined by so-called "left outer join", and a part of TsvB is taken in based on TsvA.
    - TsvA: Sales log, Monthly sales data, Access log, etc.
    - TsvB: Product master, Previous month sales data, White list, etc.
- TSV (Tab Separated Values) text assumes rectangular data.
    - Data that is not rectangular is automatically formatted with empty characters.
    - Blank lines (including the end) are ignored.
- Set the column ID (starting with zero) for the common column for matching.
    - For each data in the TsvA common column, a matching search is performed for the TsvB common column.
    - The matching is executed in order from the top line, and the search ends as soon as it is found.
    - The matching is case sensitive.
- Set the column IDs to be joined in CSV format. (TsvB columns to be brought)
    - If not set, all columns in TsvB will be joined. (including common columns)

APPENDIX:
- If there is a blank line in the input TSV text, it will be skipped.
    - The line feed code for the last line is not added either.
- The number of cells in output TSV (combined TSV) is uniform for each row. (Rectangle Matrix)
    - If the number of cells in TsvA is not uniform in each row, blank strings will be added.
- Empty strings are added to the unmatched TsvA rows as many as the number of join columns.
    - The number of cells in output TSV (combined TSV) is uniform for each row.
    - To combine non-empty string, set the alternative strings in CSV format.
    - If the list of alternative strings is insufficient, the final string is repeatedly assigned.


Notes-ja:
- 案件が自動処理工程に到達した際、文字列型データに保存されているTsvAとTsvBが自動的に結合されます。
    - いわゆる「左外部結合」にて結合され、TsvAをベースにTsvBの一部が取り込まれます。
    - TsvA: 売上ログ、月間売上データ、アクセスログ、など
    - TsvB: 商品マスタ、前月売上データ、ホワイトリスト、など
- TSV（Tab Separated Values）テキストは、矩形データを前提とします。
    - 矩形でないデータは、空文字によって自動整形されます。
    - 空行（末尾改行を含む）は無視されます。
- マッチングに利用される共通列は、列ID（ゼロ始まり）を設定してください。
    - TsvA共通列の各データについて、TsvB共通列をマッチング探索します。
    - マッチング探索は上の行から順に実行され、発見次第で探索は終了します。
    - マッチング判定において大文字小文字は区別されます。
- マッチング時に結合される列ID（取り込まれるTsvB列）はCSV形式で設定してください。
    - 未設定の場合、TsvBのすべての列が結合されます。（共通列を含む）

APPENDIX-ja:
- 入力TSVテキストに空行がある場合、スキップされます。
    - 最終行の改行コードも付与されません。
- 出力されるTSV（結合されたTSV）のセル数は各行で均一となります。（矩形マトリックス）
    - TsvAのセル数が各行で均一でなかった場合、空文字が追加されます。
- マッチングされなかったTsvAの行には、結合列の数だけ空文字が追加されます。
    - 空文字以外を結合させたい場合は、代替文字をCSV形式で設定してください。
    - 代替文字の列挙が足りない場合、最終の代替文字が繰り返し代入されます。
*/

Download

Two-Tsv-Strings-Join-on-Common-Column-2021.xml
- 2021-08-30 (C) Questetra, Inc. (MIT License)
Two-Tsv-Strings-Join-on-Common-Column-2023.xml
- 2023-08-30 (C) Questetra, Inc. (MIT License)
- for “GraalJS standard (engine-type 3)” on v15.0 or above

Addonファイルのインポートは Professional でのみご利用いただけます。
自由改変可能な JavaScript (ECMAScript) コードです。いかなる保証もありません。

Notes

案件が自動処理工程に到達した際、文字列型データに保存されているTsvAとTsvBが自動的に結合されます。
- いわゆる「左外部結合」にて結合され、TsvAをベースにTsvBの一部が取り込まれます。
- TsvA: 売上ログ、月間売上データ、アクセスログ、など
- TsvB: 商品マスタ、前月売上データ、ホワイトリスト、など
TSV（Tab Separated Values）テキストは、矩形データを前提とします。
- 矩形でないデータは、空文字によって自動整形されます。
- 空行（末尾改行を含む）は無視されます。
マッチングに利用される共通列は、列ID（ゼロ始まり）を設定してください。
- TsvA共通列の各データについて、TsvB共通列をマッチング探索します。
- マッチング探索は上の行から順に実行され、発見次第で探索は終了します。
- マッチング判定において大文字小文字は区別されます。
マッチング時に結合される列ID（取り込まれるTsvB列）はCSV形式で設定してください。
- 未設定の場合、TsvBのすべての列が結合されます。（共通列を含む）

Capture

共通のキーFieldに基づいて2つのTSVをマージします。TsvA（左テーブル）の全行を保持し、TsvB（右テーブル）から一致する行を取り込みます〔左外部結合〕。どちらのTSVも事前ソートは不要ですが、TsvBのより上位にある行がマッチング判定されます。

Appendix

入力TSVテキストに空行がある場合、スキップされます。
- 最終行の改行コードも付与されません。
出力されるTSV（結合されたTSV）のセル数は各行で均一となります。（矩形マトリックス）
- TsvAのセル数が各行で均一でなかった場合、空文字が追加されます。
マッチングされなかったTsvAの行には、結合列の数だけ空文字が追加されます。
- 空文字以外を結合させたい場合は、代替文字をCSV形式で設定してください。
- 代替文字の列挙が足りない場合、最終の代替文字が繰り返し代入されます。