Two TSV Strings, Join on Common Column

Two TSV Strings, Join on Common Column

Two TSV Strings, Join on Common Column
Merges two TSVs based on a common key-field. Keeps all the rows from TsvA (left table) and brings in any matching rows from TsvB (right table); Left Outer Join. Neither TSV needs to be sorted. Only the upper row in TsvB will be matched.
Configs
  • A1: Set Surviving TsvA *#{EL}
  • A2: Set Common Column ID of TsvA (eg “0” ) *#{EL}
  • B1: Set Absorbed TsvB *#{EL}
  • B2: Set Common Column ID of TsvB (eg “2” ) *#{EL}
  • B3: Set Join Column IDs of TsvB (eg “0,1,3” )#{EL}
  • B4: Set Alternative Strings (eg “–,0,NA” )#{EL}
  • C1: Select STRING DATA that stores Merged TSV (update) *
Script (click to open)
// GraalJS Script (engine type: 2)

//////// START "main()" /////////////////////////////////////////////////////////////////
main();
function main(){ 

//// == Config Retrieving / 工程コンフィグの参照 ==
const strLeftTsv       = configs.get( "StrConfA1" );          /// REQUIRED //////////////
  if( strLeftTsv     === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A1: LeftTsv} is empty \n" );
  }
  const arr2dLeftTsv   = parseAsRectangular( strLeftTsv );
const strLeftTsvKey    = configs.get( "StrConfA2" );          /// REQUIRED //////////////
  if( strLeftTsvKey  === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A2: LeftTsvKey} is empty \n" );
  }
  const numLeftTsvKey  = parseInt( strLeftTsvKey, 10 );
  if( isNaN( numLeftTsvKey ) || numLeftTsvKey < 0 ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A2: LeftTsvKey} must be a positive integer \n" );
  }
  if( numLeftTsvKey   >= arr2dLeftTsv[0].length ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A2: LeftTsvKey} is larger than LeftTsvWidth \n" );
  }

const strRightTsv      = configs.get( "StrConfB1" );          /// REQUIRED //////////////
  if( strRightTsv    === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B1: RightTsv} is empty \n" );
  }
  const arr2dRightTsv  = parseAsRectangular( strRightTsv );
const strRightTsvKey   = configs.get( "StrConfB2" );          /// REQUIRED //////////////
  if( strRightTsvKey === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B2: RightTsvKey} is empty \n" );
  }
  const numRightTsvKey = parseInt( strRightTsvKey, 10 );
  if( isNaN( numRightTsvKey ) || numRightTsvKey < 0 ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B2: RightTsvKey} must be a positive integer \n" );
  }
  if( numRightTsvKey   >= arr2dRightTsv[0].length ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {B2: RightTsvKey} is larger than RightTsvWidth \n" );
  }

let   strJoinIds       = configs.get( "StrConfB3" );          // NotRequired ////////////
  if( strJoinIds     === "" ){
    for( let i = 0; i < arr2dRightTsv[0].length; i++ ){
      strJoinIds += i + "";
      if( i !== arr2dRightTsv[0].length - 1 ){ strJoinIds += ","; }
    }
  }
  let arrJoinIds       = strJoinIds.split(",");

let   strJoinStrings   = configs.get( "StrConfB4" );          // NotRequired ////////////
  if( strJoinStrings === "" ){
    for( let i = 0; i < arr2dRightTsv[0].length; i++ ){
      strJoinStrings += "";
      if( i !== arr2dRightTsv[0].length - 1 ){ strJoinIds += ","; }
    }
  }
  let arrJoinStrings   = strJoinStrings.split(",");


const strPocketOutput  = configs.getObject( "SelectConfC1" ); /// REQUIRED //////////////


//// == Data Retrieving / ワークフローデータの参照 ==
// (Nothing. Retrieved via Expression Language in Config Retrieving)


//// == Calculating / 演算 ==

// compare LeftTsv with RightTsv
let strOutput = "";
for( let i = 0; i < arr2dLeftTsv.length; i++ ){
  let boolMatchExists = false;
  for( let j = 0; j < arr2dRightTsv.length; j++ ){
    if( arr2dLeftTsv[i][numLeftTsvKey] === arr2dRightTsv[j][numRightTsvKey] ){
      boolMatchExists = true;
      strOutput += arr2dLeftTsv[i].join("\t") + "\t";
      for( let k = 0; k < arrJoinIds.length; k++ ){
        strOutput += arr2dRightTsv[j][ parseInt(arrJoinIds[k]) ];
        if( k !== arrJoinIds.length - 1 ){
          strOutput += "\t";
        }
      }
      strOutput += "\n";
      break;
    }
  }
  if( ! boolMatchExists ){
    strOutput += arr2dLeftTsv[i].join("\t") + "\t";
    for( let k = 0; k < arrJoinIds.length; k++ ){
      if( k < arrJoinStrings.length ){
        strOutput += arrJoinStrings[k];
      }else{
        strOutput += arrJoinStrings[arrJoinStrings.length - 1];
      }
      if( k !== arrJoinIds.length - 1 ){
        strOutput += "\t";
      }
    }
    strOutput += "\n";
  }
}
strOutput = strOutput.slice( 0, -1 ); // delete last "\n"


//// == Data Updating / ワークフローデータへの代入 ==
engine.setData( strPocketOutput, strOutput );


} //////// END "main()" /////////////////////////////////////////////////////////////////


// Parses TSV string as two-dimensional rectangular data matrix and creates a 2D array.
function parseAsRectangular( strTsv ){
  const arrTsv = strTsv.split("\n");

  /// Get numMinWidth and numMaxWidth (blank lines are excluded)
  let numMinWidth   = Infinity; // cf. String-Type Max: 1 million
  let numMaxWidth   = 0;
  let numBlanklines = 0;
  for( let i = 0; i < arrTsv.length; i++ ){
    if( arrTsv[i] === "" ){ // Skip blank lines
      numBlanklines += 1;
      continue;
    }
    let arrCells = arrTsv[i].split("\t");
    if( numMinWidth > arrCells.length ){ numMinWidth = arrCells.length; }
    if( numMaxWidth < arrCells.length ){ numMaxWidth = arrCells.length; }
  }
  engine.log( " AutomatedTask TsvDataCheck:" + 
              " MinWidth:" + numMinWidth +
              " MaxWidth:" + numMaxWidth +
              " Lines:" + arrTsv.length +
              " (BlankLines:" + numBlanklines + ")" );

  /// Get numMinWidth and numMaxWidth (blank lines are excluded)
  let arr2dTsv      = [];
  for( let i = 0; i < arrTsv.length; i++ ){
    if( arrTsv[i] === "" ){ // Skip blank lines
      continue;
    }
    let arrTmp = [];
    let arrCells = arrTsv[i].split("\t");
    for( let j = 0; j < numMaxWidth; j++ ){
      if( j < arrCells.length ){
        arrTmp[j] = arrCells[j];
      }else{
        arrTmp[j] = "";
      }
    }
    arr2dTsv.push( arrTmp );
  }

  return arr2dTsv;
}

/*
Notes:
- When the process reaches this automated task, TsvA and TsvB are automatically combined.
    - They are combined by so-called "left outer join", and a part of TsvB is taken in based on TsvA.
    - TsvA: Sales log, Monthly sales data, Access log, etc.
    - TsvB: Product master, Previous month sales data, White list, etc.
- TSV (Tab Separated Values) text assumes rectangular data.
    - Data that is not rectangular is automatically formatted with empty characters.
    - Blank lines (including the end) are ignored.
- Set the column ID (starting with zero) for the common column for matching.
    - For each data in the TsvA common column, a matching search is performed for the TsvB common column.
    - The matching is executed in order from the top line, and the search ends as soon as it is found.
    - The matching is case sensitive.
- Set the column IDs to be joined in CSV format. (TsvB columns to be brought)
    - If not set, all columns in TsvB will be joined. (including common columns)

APPENDIX:
- If there is a blank line in the input TSV text, it will be skipped.
    - The line feed code for the last line is not added either.
- The number of cells in output TSV (combined TSV) is uniform for each row. (Rectangle Matrix)
    - If the number of cells in TsvA is not uniform in each row, blank strings will be added.
- Empty strings are added to the unmatched TsvA rows as many as the number of join columns.
    - The number of cells in output TSV (combined TSV) is uniform for each row.
    - To combine non-empty string, set the alternative strings in CSV format.
    - If the list of alternative strings is insufficient, the final string is repeatedly assigned.


Notes-ja:
- 案件が自動処理工程に到達した際、文字列型データに保存されているTsvAとTsvBが自動的に結合されます。
    - いわゆる「左外部結合」にて結合され、TsvAをベースにTsvBの一部が取り込まれます。
    - TsvA: 売上ログ、月間売上データ、アクセスログ、など
    - TsvB: 商品マスタ、前月売上データ、ホワイトリスト、など
- TSV(Tab Separated Values)テキストは、矩形データを前提とします。
    - 矩形でないデータは、空文字によって自動整形されます。
    - 空行(末尾改行を含む)は無視されます。
- マッチングに利用される共通列は、列ID(ゼロ始まり)を設定してください。
    - TsvA共通列の各データについて、TsvB共通列をマッチング探索します。
    - マッチング探索は上の行から順に実行され、発見次第で探索は終了します。
    - マッチング判定において大文字小文字は区別されます。
- マッチング時に結合される列ID(取り込まれるTsvB列)はCSV形式で設定してください。
    - 未設定の場合、TsvBのすべての列が結合されます。(共通列を含む)

APPENDIX-ja:
- 入力TSVテキストに空行がある場合、スキップされます。
    - 最終行の改行コードも付与されません。
- 出力されるTSV(結合されたTSV)のセル数は各行で均一となります。(矩形マトリックス)
    - TsvAのセル数が各行で均一でなかった場合、空文字が追加されます。
- マッチングされなかったTsvAの行には、結合列の数だけ空文字が追加されます。
    - 空文字以外を結合させたい場合は、代替文字をCSV形式で設定してください。
    - 代替文字の列挙が足りない場合、最終の代替文字が繰り返し代入されます。
*/

Download

The Add-on import feature is available with Professional edition.
Freely modifiable JavaScript (ECMAScript) code. No warranty of any kind.

Notes

  • When the process reaches this automated task, TsvA and TsvB are automatically combined.
    • They are combined by the so-called “left outer join”, and a part of TsvB is taken in based on TsvA.
    • TsvA: Sales log, Monthly sales data, Access log, etc.
    • TsvB: Product master, Previous month sales data, White list, etc.
  • TSV (Tab Separated Values) text assumes rectangular data.
    • Data that is not rectangular is automatically formatted with empty characters.
    • Blank lines (including the end) are ignored.
  • Set the column ID (starting with zero) for the common columns used for matching.
    • For each data in the TsvA common column, a matching search is performed for the TsvB common column.
    • The matching is executed in order from the top line, and the search ends as soon as it is found.
    • The matching is case sensitive.
  • Set the column IDs to be joined in CSV format. (TsvB columns to be imported)
    • If not set, all columns in TsvB will be merged. (including common columns)

Capture

Merges two TSVs based on a common key-field. Keeps all the rows from the TsvA (left table) and brings in any matching rows from the TsvB (right table); Left Outer Join. Neither TSV needs to be sorted. Only the upper row in the TsvB will be matched.
Merges two TSVs based on a common key-field. Keeps all the rows from the TsvA (left table) and brings in any matching rows from the TsvB (right table); Left Outer Join. Neither TSV needs to be sorted. Only the upper row in the TsvB will be matched.

Appendix

  • If there is a blank line in the input TSV text, it will be skipped.
    • The line feed code for the last line is not added either.
  • The number of cells in the (merged) output TSV is uniform for each row. (Rectangle Matrix)
    • If the number of cells in TsvA is not uniform in each row, blank strings will be added.
  • As many empty strings are added to the unmatched TsvA rows as the number of join columns.
    • The number of cells in the output TSV (combined TSV) is uniform for each row.
    • To combine non-empty strings, set the alternative strings in CSV format.
    • If the list of alternative strings is insufficient, the final string is repeatedly assigned.

See also

1 thought on “Two TSV Strings, Join on Common Column”

  1. Pingback: TSV String, Calc Changes in Two Numeric Columns – Questetra Support

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.

Scroll to Top

Discover more from Questetra Support

Subscribe now to keep reading and get access to the full archive.

Continue reading