#TSV-String: Extract by RegExp for each Line

#TSV-String: Extract by RegExp for each Line

#TSV-String: Extract by RegExp for each Line

translate #TSV文字列: 各行ごとに正規表現で抽出

Extracts a string that matches the RegExp for each line. For example, if you set an email address regular expression, the email address in the specified cell will be added as a new cell at the end of the line. If there is no match, an empty string will be added. It is also possible to limit extraction to regular expression capture groups by specifying the capture group ID (from 1).

Auto Step icon
Configs for this Auto Step
StrConfA
A: Set Input TSV String *#{EL}
StrConfB1
B1: Set Regular Expression (eg: “(\d{3}-\d{4})|(\d{7})”) *#{EL}
BoolConfB2
B2: Case Sensitive or Case should be Ignored
StrConfB3
B3: Set Column IDs (Cell IDs) to Search for (eg: “0,2,3”)#{EL}
StrConfB4
B4: If necessary, Set Capturing Group IDs (eg “0,1”)#{EL}
SelectConfC
C: Select DATA to store Output TSV Strings (update) *
SelectConfD1
D1: Select NUMERIC for Number of TSV Lines (update)
SelectConfD2
D2: Select NUMERIC for Number of TSV Lines Trimmed (update)
Script (click to open)
// Script Example of Business Process Automation
// for 'engine type: 3' ("GraalJS standard mode")
// cf. 'engine type: 2' ("GraalJS Nashorn compatible mode") (renamed from "GraalJS" at 20230526)


//////// START "main()" /////////////////////////////////////////////////////////////////
main();
function main(){ 

//// == Config Retrieving / 工程コンフィグの参照 ==
const strTsv        = configs.get        ( "StrConfA" );     // REQUIRED
  if( strTsv === "" ){
    throw new Error( "\n AutomatedTask ConfigError:" +
                     " Config {A: TSV} is empty \n" );
  }
  const numTsvLines = strTsv.split("\n").length;
  const numTsvNoLf  = strTsv.replace(/[\n]*$/, "").split("\n").length;
                      // delete Line Feed codes at the end
  // get TSV as Array-Array (2d Array)
  const arr2dTsv    = parseAsRectangular ( strTsv );  // [row,col]

const strRegExp      = configs.get       ( "StrConfB1" );     // REQUIRED
const boolIgnoreCase = configs.getObject ( "BoolConfB2" );    // TOGGLE
  // https://questetra.zendesk.com/hc/ja/articles/360024574471-R2300 "Boolean object"

const strSearchCols  = configs.get       ( "StrConfB3" )  === "" ? // not required
                      "0" : // default
                       configs.get       ( "StrConfB3" ) ; 
const arrSearchCols  = strSearchCols.split(",");

const strCapturing   = configs.get       ( "StrConfB4" )  === "" ? // not required
                       "0" : // default
                       configs.get       ( "StrConfB4" ) ; 
const arrCapturing   = strCapturing.split(",");

const strPocketC     = configs.getObject ( "SelectConfC" );   // REQUIRED
const numPocketD1    = configs.getObject ( "SelectConfD1" );  // not required
const numPocketD2    = configs.getObject ( "SelectConfD2" );  // not required



//// == Data Retrieving / ワークフローデータの参照 ==
// (nothing)



//// == Calculating / 演算 ==
const regSearch  = boolIgnoreCase ?
                   new RegExp( strRegExp, 'i' ) : new RegExp( strRegExp );

let arrOutputTsv  = [];

for ( let i = 0; i < arr2dTsv.length; i++ ) {
  let strTarget = "";
  for ( let j = 0; j < arrSearchCols.length; j++ ) {
    strTarget += arr2dTsv[i][ parseInt(arrSearchCols[j], 10) ];
    if ( j !== arrSearchCols.length - 1 ) {
      strTarget += "\t";
    }
  }

  let arrFound = strTarget.match ( regSearch );
  engine.log( " AutomatedTask Match Index: " + arrFound?.index );
  let arrTmp = [...arr2dTsv[i]]; // Spread syntax
  for ( let k = 0; k < arrCapturing.length; k++ ) {
    let strMatch = arrFound?.[ parseInt(arrCapturing[k], 10) ] ?? ""; // Optional Chaining and Nullish Coalescing
    arrTmp.push ( strMatch );
  }
  arrOutputTsv.push ( arrTmp?.join( '\t' ) ?? "" );
}



//// == Data Updating / ワークフローデータへの代入 ==
/// ref) Retrieving / Updating from ScriptTasks
/// https://questetra.zendesk.com/hc/en-us/articles/360024574771-R2301
/// https://questetra.zendesk.com/hc/ja/articles/360024574771-R2301

if ( strPocketC !== null ){ 
  engine.setData( strPocketC, arrOutputTsv?.join( '\n' ) ?? "" );
}

if ( numPocketD1 !== null ){ 
  engine.setData( numPocketD1, new java.math.BigDecimal( numTsvLines ) );
}
if ( numPocketD2 !== null ){ 
  engine.setData( numPocketD2, new java.math.BigDecimal( numTsvNoLf ) );
}

} //////// END "main()" /////////////////////////////////////////////////////////////////



// Parses TSV string as two-dimensional rectangular data matrix and creates a 2D array.
function parseAsRectangular( strTsv ){
  const arrTsv = strTsv.split("\n");

  /// Get numMinWidth and numMaxWidth (blank lines are excluded)
  let numMinWidth   = Infinity; // cf. String-Type Max: 1 million
  let numMaxWidth   = 0;
  let numBlanklines = 0;
  for( let i = 0; i < arrTsv.length; i++ ){
    if( arrTsv[i] === "" ){ // Skip blank lines
      numBlanklines += 1;
      continue;
    }
    let arrCells = arrTsv[i].split("\t");
    if( numMinWidth > arrCells.length ){ numMinWidth = arrCells.length; }
    if( numMaxWidth < arrCells.length ){ numMaxWidth = arrCells.length; }
  }
  engine.log( " AutomatedTask TsvDataCheck:" + 
              " MinWidth:" + numMinWidth +
              " MaxWidth:" + numMaxWidth +
              " Lines:" + arrTsv.length +
              " (BlankLines:" + numBlanklines + ")" );

  /// Get numMinWidth and numMaxWidth (blank lines are excluded)
  let arr2dTsv      = [];
  for( let i = 0; i < arrTsv.length; i++ ){
    if( arrTsv[i] === "" ){ // Skip blank lines
      continue;
    }
    let arrTmp = [];
    let arrCells = arrTsv[i].split("\t");
    for( let j = 0; j < numMaxWidth; j++ ){
      if( j < arrCells.length ){
        arrTmp[j] = arrCells[j];
      }else{
        arrTmp[j] = "";
      }
    }
    arr2dTsv.push( arrTmp );
  }

  return arr2dTsv;
}



/*
### NOTES
- When a Process reaches this [Automated Step], the extraction will be automatically executed.
    - Any regular expression can be set in the Automatic Step configuration.
- A string that matches the RegExp is extracted for each line of the TSV text.
    - Only the first match is extracted.
    - The search range (cells) can be specified. (eg: "0,2,3") (default: "0")
    - The string that matches is added to the end of each line. (The number of TSV columns increases.)
    - If there is no match, the empty string `"" ` is added.
    - If you set `\d{3}-\d{4}`, strings such as "150-8001" and "604-0835" will be extracted.
- Partial extractions are also possible using a capturing group `()` in a regular expression.
    - ID "0": "Matched string" (Full extraction)
    - ID "1": "String matched by the first capture group" (Partial extraction)
    - If you set `(\d{3})-(\d{4})`, any or all of "150-8001", "150", and "8001" are added.
    - The number of TSV columns increases according to the number of setting IDs.
- The number of lines in the TSV text can also be recorded.
- Regular Expressions
    - https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions

### NOTES-ja
- この[自動工程]に案件が到達すると、「抽出処理」が自動実行されます。
    - 自動工程のコンフィグ(設定)には、任意の正規表現が設定できます。
- 正規表現に「マッチした文字列」が、TSVテキスト各行ごとに抽出されます。
    - 一番最初に一致したものだけが抽出されます。
    - 検索範囲(セルIDs)を指定できます。(例: "0,2,3")(デフォルト範囲: "0")
    - 各行の末尾に「マッチした文字列」が追加されます。(TSV列数が増えます)
    - マッチしなければ、空文字列 "" が追加されます。
    - たとえば `\d{3}-\d{4}` と設定しておけば "150-8001" や "604-0835" といった文字列が抽出されます
- 正規表現内にキャプチャグループ `()` を使用すれば、部分抽出も可能です。
    - ID "0": 「マッチした文字列」(全体抽出)
    - ID "1": 「"1番目のキャプチャグループ" がマッチした文字列」(部分抽出)
    - `(\d{3})-(\d{4})` と設定すれば "150-8001", "150", "8001" のいずれかもしくは全てを抽出できます。
    - 設定IDの数に応じて、TSV列数が増えます。
- 入力TSVテキストの行数も記録可能です。
- 正規表現とは
    - https://developer.mozilla.org/ja/docs/Web/JavaScript/Guide/Regular_expressions


### Input example
NHK放送センター	150-8001 東京都渋谷区神南2丁目2-1
株式会社クエステトラ	604-0835 京都市中京区御池通間之町東入高宮町206御池ビル4階
NHK放送センター東京	150-8001 東京都渋谷区神南2丁目2-1	150-8001	150	8001	
株式会社クエステトラ	604-0835 京都市中京区御池通間之町東入高宮町206御池ビル4階	604-0835	604	0835	
### Output example using `\d{3}-\d{4}`
NHK放送センター	150-8001 東京都渋谷区神南2丁目2-1	150-8001
株式会社クエステトラ	604-0835 京都市中京区御池通間之町東入高宮町206御池ビル4階	604-0835

### Output example using `(\d{3})-(\d{4})` capturing `0,1,2`
NHK放送センター	150-8001 東京都渋谷区神南2丁目2-1	150-8001	150	8001
株式会社クエステトラ	604-0835 京都市中京区御池通間之町東入高宮町206御池ビル4階	604-0835	604	0835



APPENDIX
- 正規表現 設定例 / RegExp Example
    - 日本の郵便番号 / Japanese Postal Code
        - `\d{3}-\d{4}`
        - `[0-9]{3}-[0-9]{4}`
        - `(\d{3})-(\d{4})`
        - `(\d{3}-\d{4})`
        - `(\d{3}-\d{4})|(\d{7})`
        - `([0-9]{3}-[0-9]{4})|([0-9]{7})`
    - Email
        - `[\w_.+-]+@[\w-.]+[a-zA-Z]{2,}`
        - `[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-.]+[a-zA-Z]{2,}`
        - `[a-zA-Z0-9_.+-]+@([a-zA-Z0-9-.]+[a-zA-Z]{2,})`
        - `[a-zA-Z0-9_.+-]+@([a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.)+[a-zA-Z]{2,}`
        - `[a-zA-Z0-9_.+-]+@(([a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.)+[a-zA-Z]{2,})`
    - URL
        - `https?://[\w/:%#\$&\?\(\)~\.=\+\-]+`
    - ISO Date from 2024-12-15 to 2025-01-06
        - `(2024-12-1[5-9])|(2024-12-[2-3][0-9])|(2025-01-0[1-6])`
- 正規表現グループ / Groups in regular expressions
    - `(apple|orange)`
        - "apple" か "orange" のいずれか / Either "apple" or "orange"
    - 正規表現は複数のキャプチャグループを持つことができます。
        - キャプチャグループ内の左括弧と同じ順にある、配列要素に一致します。
    - A regular expression may have multiple capturing groups.
        - Matches to capturing groups in an array whose members are in the same order as the left parentheses
- 正規表現文字クラス / A character class
    - `[xyz]`
        - "x" か "y" か "z" の 一文字 / Any one of the enclosed characters, "x" "y" or "z"
    - `[a-c]`
        - "a" から "c" までのいずれか一文字 / Any one in the range "a" to "c"
    - `[^xyz]`
        - "x" でも "y" でも "z" でもない一文字 / Any one that is neither "x" nor "y" nor "z"
    - `\d`
        - 数字一文字 / Any digit / `[0-9]`
    - `\w`
        - 半角英数字一文字 / Any alphanumeric character / `[A-Za-z0-9_]`
    - `\t`
        - 水平タブ / A horizontal tab
    - `.`
        - 改行文字を除くあらゆる一文字 / Any single character except line terminators
- 正規表現アサーション言明 / Assertions
    - `^`
        - 先頭 / Beginning
    - `$`
        - 末尾 / End
    - `\b`
        - 区切り / A word boundary
    - `\B`
        - 区切り以外 / A non-word boundary
- 正規表現数量詞 / Quantifiers
    - `x*`
        - 直前アイテム "x" の0回以上の繰返 / The preceding item "x" 0 or more times
    - `x+`
        - 直前アイテム "x" の1回以上の繰返 / The preceding item "x" 1 or more times
    - `x?`
        - 直前アイテム "x" の0回か1回の出現 / The preceding item "x" 0 or 1 times
    - `x{n}`
        - 直前アイテム "x" のn回の出現 / "n" occurrences of the preceding item "x"
    - `x{n,m}`
        - 直前アイテム "x" がnからm回出現 / at least "n" and at most "m" occurrences

*/

Download

warning Freely modifiable JavaScript (ECMAScript) code. No warranty of any kind.
(Installing Addon Auto-Steps are available only on the Professional edition.)

Notes

  • When a Process reaches this [Automated Step], the extraction is automatically executed.
    • Any regular expression can be set in the Automatic Step configuration.
  • A string that matches the RegExp is extracted for each line of the TSV text.
    • Only the first match is extracted.
    • The search range (cells) can be specified. (eg: “0,2,3”) (default: “0”)
    • The string that matches is added to the end of each line. (The number of TSV columns increases.)
    • If there is no match, the empty string ""is added.
    • If you set \d{3}-\d{4}, strings such as “150-8001” and “604-0835” will be extracted.
  • Partial extractions are also possible using a capturing group () in a regular expression.
    • ID “0”: “Matched string” (Full extraction)
    • ID “1”: “String matched by the first capture group” (Partial extraction)
    • If you set (\d{3})-(\d{4}), any or all of “150-8001”, “150”, and “8001” are added.
    • The number of TSV columns increases according to the number of setting IDs.
  • The number of lines in the TSV text can also be recorded.
  • Regular Expressions

Capture

Appendix

  • 正規表現 設定例 / RegExp Example
    • 日本の郵便番号 / Japanese Postal Code
      • \d{3}-\d{4}
      • [0-9]{3}-[0-9]{4}
      • (\d{3})-(\d{4})
      • (\d{3}-\d{4})
      • (\d{3}-\d{4})|(\d{7})
      • ([0-9]{3}-[0-9]{4})|([0-9]{7})
    • Email
      • [\w_.+-]+@[\w-.]+[a-zA-Z]{2,}
      • [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-.]+[a-zA-Z]{2,}
      • [a-zA-Z0-9_.+-]+@([a-zA-Z0-9-.]+[a-zA-Z]{2,})
      • [a-zA-Z0-9_.+-]+@([a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.)+[a-zA-Z]{2,}
      • [a-zA-Z0-9_.+-]+@(([a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.)+[a-zA-Z]{2,})
    • URL
      • https?://[\w/:%#\$&\?\(\)~\.=\+\-]+
    • ISO Date from 2024-12-15 to 2025-01-06
      • (2024-12-1[5-9])|(2024-12-[2-3][0-9])|(2025-01-0[1-6])
  • 正規表現グループ / Groups in regular expressions
    • (apple|orange)
      • “apple” か “orange” のいずれか / Either “apple” or “orange”
    • 正規表現は複数のキャプチャグループを持つことができます。
      • キャプチャグループ内の左括弧と同じ順にある、配列要素に一致します。
    • A regular expression may have multiple capturing groups.
      • Matches to capturing groups in an array whose members are in the same order as the left parentheses
  • 正規表現文字クラス / A character class
    • [xyz]
      • “x” か “y” か “z” の 一文字 / Any one of the enclosed characters, “x” “y” or “z”
    • [a-c]
      • “a” から “c” までのいずれか一文字 / Any one in the range “a” to “c”
    • [^xyz]
      • “x” でも “y” でも “z” でもない一文字 / Any one that is neither “x” nor “y” nor “z”
    • \d
      • 数字一文字 / Any digit / [0-9]
    • \w
      • 半角英数字一文字 / Any alphanumeric character / [A-Za-z0-9_]
    • \t
      • 水平タブ / A horizontal tab
    • .
      • 改行文字を除くあらゆる一文字 / Any single character except line terminators
  • 正規表現アサーション言明 / Assertions
    • ^
      • 先頭 / Beginning
    • $
      • 末尾 / End
    • \b
      • 区切り / A word boundary
    • \B
      • 区切り以外 / A non-word boundary
  • 正規表現数量詞 / Quantifiers
    • x*
      • 直前アイテム “x” の0回以上の繰返 / The preceding item “x” 0 or more times
    • x+
      • 直前アイテム “x” の1回以上の繰返 / The preceding item “x” 1 or more times
    • x?
      • 直前アイテム “x” の0回か1回の出現 / The preceding item “x” 0 or 1 times
    • x{n}
      • 直前アイテム “x” のn回の出現 / ”n” occurrences of the preceding item “x”
    • x{n,m}
      • 直前アイテム “x” がnからm回出現 / at least “n” and at most “m” occurrences

Input example

NHK放送センター    150-8001 東京都渋谷区神南2丁目2-1
株式会社クエステトラ    604-0835 京都市中京区御池通間之町東入高宮町206御池ビル4階

Output example using \d{3}-\d{4}

NHK放送センター    150-8001 東京都渋谷区神南2丁目2-1 150-8001
株式会社クエステトラ    604-0835 京都市中京区御池通間之町東入高宮町206御池ビル4階 604-0835

Output example using (\d{3})-(\d{4}) capturing 0,1,2

NHK放送センター    150-8001 東京都渋谷区神南2丁目2-1 150-8001    150 8001
株式会社クエステトラ    604-0835 京都市中京区御池通間之町東入高宮町206御池ビル4階 604-0835    604 0835

See Also

Scroll to Top

Discover more from Questetra Support

Subscribe now to keep reading and get access to the full archive.

Continue reading