File: //home/arjun/projects/env/lib/python3.10/site-packages/elasticsearch/_sync/client/text_structure.py
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import typing as t
from elastic_transport import ObjectApiResponse
from ._base import NamespacedClient
from .utils import _rewrite_parameters
class TextStructureClient(NamespacedClient):
@_rewrite_parameters(
body_name="text_files",
)
def find_structure(
self,
*,
text_files: t.Sequence[t.Any],
charset: t.Optional[str] = None,
column_names: t.Optional[str] = None,
delimiter: t.Optional[str] = None,
explain: t.Optional[bool] = None,
format: t.Optional[str] = None,
grok_pattern: t.Optional[str] = None,
has_header_row: t.Optional[bool] = None,
line_merge_size_limit: t.Optional[int] = None,
lines_to_sample: t.Optional[int] = None,
quote: t.Optional[str] = None,
should_trim_fields: t.Optional[bool] = None,
timeout: t.Optional[t.Union["t.Literal[-1]", "t.Literal[0]", str]] = None,
timestamp_field: t.Optional[str] = None,
timestamp_format: t.Optional[str] = None,
) -> ObjectApiResponse[t.Any]:
"""
Finds the structure of a text file. The text file must contain data that is suitable
to be ingested into Elasticsearch.
`<https://www.elastic.co/guide/en/elasticsearch/reference/8.11/find-structure.html>`_
:param text_files:
:param charset: The text’s character set. It must be a character set that is
supported by the JVM that Elasticsearch uses. For example, UTF-8, UTF-16LE,
windows-1252, or EUC-JP. If this parameter is not specified, the structure
finder chooses an appropriate character set.
:param column_names: If you have set format to delimited, you can specify the
column names in a comma-separated list. If this parameter is not specified,
the structure finder uses the column names from the header row of the text.
If the text does not have a header role, columns are named "column1", "column2",
"column3", etc.
:param delimiter: If you have set format to delimited, you can specify the character
used to delimit the values in each row. Only a single character is supported;
the delimiter cannot have multiple characters. By default, the API considers
the following possibilities: comma, tab, semi-colon, and pipe (|). In this
default scenario, all rows must have the same number of fields for the delimited
format to be detected. If you specify a delimiter, up to 10% of the rows
can have a different number of columns than the first row.
:param explain: If this parameter is set to true, the response includes a field
named explanation, which is an array of strings that indicate how the structure
finder produced its result.
:param format: The high level structure of the text. Valid values are ndjson,
xml, delimited, and semi_structured_text. By default, the API chooses the
format. In this default scenario, all rows must have the same number of fields
for a delimited format to be detected. If the format is set to delimited
and the delimiter is not set, however, the API tolerates up to 5% of rows
that have a different number of columns than the first row.
:param grok_pattern: If you have set format to semi_structured_text, you can
specify a Grok pattern that is used to extract fields from every message
in the text. The name of the timestamp field in the Grok pattern must match
what is specified in the timestamp_field parameter. If that parameter is
not specified, the name of the timestamp field in the Grok pattern must match
"timestamp". If grok_pattern is not specified, the structure finder creates
a Grok pattern.
:param has_header_row: If you have set format to delimited, you can use this
parameter to indicate whether the column names are in the first row of the
text. If this parameter is not specified, the structure finder guesses based
on the similarity of the first row of the text to other rows.
:param line_merge_size_limit: The maximum number of characters in a message when
lines are merged to form messages while analyzing semi-structured text. If
you have extremely long messages you may need to increase this, but be aware
that this may lead to very long processing times if the way to group lines
into messages is misdetected.
:param lines_to_sample: The number of lines to include in the structural analysis,
starting from the beginning of the text. The minimum is 2; If the value of
this parameter is greater than the number of lines in the text, the analysis
proceeds (as long as there are at least two lines in the text) for all of
the lines.
:param quote: If you have set format to delimited, you can specify the character
used to quote the values in each row if they contain newlines or the delimiter
character. Only a single character is supported. If this parameter is not
specified, the default value is a double quote ("). If your delimited text
format does not use quoting, a workaround is to set this argument to a character
that does not appear anywhere in the sample.
:param should_trim_fields: If you have set format to delimited, you can specify
whether values between delimiters should have whitespace trimmed from them.
If this parameter is not specified and the delimiter is pipe (|), the default
value is true. Otherwise, the default value is false.
:param timeout: Sets the maximum amount of time that the structure analysis make
take. If the analysis is still running when the timeout expires then it will
be aborted.
:param timestamp_field: Optional parameter to specify the timestamp field in
the file
:param timestamp_format: The Java time format of the timestamp field in the text.
"""
if text_files is None:
raise ValueError("Empty value passed for parameter 'text_files'")
__path = "/_text_structure/find_structure"
__query: t.Dict[str, t.Any] = {}
if charset is not None:
__query["charset"] = charset
if column_names is not None:
__query["column_names"] = column_names
if delimiter is not None:
__query["delimiter"] = delimiter
if explain is not None:
__query["explain"] = explain
if format is not None:
__query["format"] = format
if grok_pattern is not None:
__query["grok_pattern"] = grok_pattern
if has_header_row is not None:
__query["has_header_row"] = has_header_row
if line_merge_size_limit is not None:
__query["line_merge_size_limit"] = line_merge_size_limit
if lines_to_sample is not None:
__query["lines_to_sample"] = lines_to_sample
if quote is not None:
__query["quote"] = quote
if should_trim_fields is not None:
__query["should_trim_fields"] = should_trim_fields
if timeout is not None:
__query["timeout"] = timeout
if timestamp_field is not None:
__query["timestamp_field"] = timestamp_field
if timestamp_format is not None:
__query["timestamp_format"] = timestamp_format
__body = text_files
__headers = {
"accept": "application/json",
"content-type": "application/x-ndjson",
}
return self.perform_request( # type: ignore[return-value]
"POST", __path, params=__query, headers=__headers, body=__body
)