/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package gobblin.type;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import lombok.extern.slf4j.Slf4j;
import gobblin.metadata.types.Metadata;
/**
 * Utilities to work with MIME content-types.
 *
 * <p>Holds a registry that maps a content-type (or transfer-encoding) name to the name of the
 * character set its payload is expected to use. The registry is a {@link ConcurrentHashMap} that is
 * pre-populated in the constructor and can be extended at runtime through
 * {@link #registerCharsetMapping(String, String)}. The class is a singleton; obtain it via
 * {@link #getInstance()}.
 */
@Slf4j
public class ContentTypeUtils {
  // Compile-time constants, so they are safe to use while INSTANCE is being constructed.
  private static final String UTF_8 = "UTF-8";
  private static final String BINARY = "BINARY";

  private static final ContentTypeUtils INSTANCE = new ContentTypeUtils();

  /**
   * @return the shared {@link ContentTypeUtils} instance
   */
  public static ContentTypeUtils getInstance() {
    return INSTANCE;
  }

  /** Content-type / transfer-encoding name to charset name. */
  private final ConcurrentHashMap<String, String> knownCharsets = new ConcurrentHashMap<>();

  /**
   * Check which character set a given content-type corresponds to.
   *
   * <p>Explicitly registered mappings win. Otherwise any {@code text/*} type and any type with a
   * {@code +json} or {@code +xml} suffix is treated as UTF-8, and everything else as binary.
   *
   * @param contentType Content-type to check; {@code null} is treated as an unknown type
   * @return Charset the mimetype represents. "BINARY" if binary data (or if the type is unknown).
   */
  public String getCharset(String contentType) {
    if (contentType == null) {
      // ConcurrentHashMap rejects null keys; an absent type is simply unknown.
      return BINARY;
    }

    String charSet = knownCharsets.get(contentType);
    if (charSet != null) {
      return charSet;
    }

    // Special cases
    if (contentType.startsWith("text/") || contentType.endsWith("+json") || contentType.endsWith("+xml")) {
      return UTF_8;
    }

    return BINARY;
  }

  /**
   * Heuristic to infer if content is printable from metadata.
   *
   * <p>If at least one transfer-encoding is present, only the last entry of the list decides the
   * result (NOTE(review): presumably the most recently applied encoding - confirm against the
   * writers that populate it). If the list is absent or empty, the content-type is consulted
   * instead. With neither available the content is assumed to be binary.
   *
   * @param md metadata describing the content; its global metadata is read
   * @return {@code true} if the inferred charset is "UTF-8", {@code false} otherwise
   */
  public boolean inferPrintableFromMetadata(Metadata md) {
    String inferredCharset = BINARY;

    List<String> transferEncoding = md.getGlobalMetadata().getTransferEncoding();
    if (transferEncoding != null && !transferEncoding.isEmpty()) {
      inferredCharset = getCharset(transferEncoding.get(transferEncoding.size() - 1));
    } else if (md.getGlobalMetadata().getContentType() != null) {
      // No encoding recorded (null or empty list): fall back to the declared content-type.
      inferredCharset = getCharset(md.getGlobalMetadata().getContentType());
    }

    return UTF_8.equals(inferredCharset);
  }

  /**
   * Register a new contentType to charSet mapping. An existing mapping for the same content-type
   * is replaced and a warning is logged.
   *
   * @param contentType Content-type to register; must not be {@code null}
   * @param charSet charSet associated with the content-type; must not be {@code null}
   */
  public void registerCharsetMapping(String contentType, String charSet) {
    // put() returns the previous value, so the "already registered" check is atomic with the
    // update and looks at the key (the legacy contains(Object) method tests values instead).
    String previous = knownCharsets.put(contentType, charSet);
    if (previous != null) {
      log.warn("{} is already registered; re-registering", contentType);
    }
  }

  /** Seeds the registry with the built-in mappings. */
  private ContentTypeUtils() {
    // Transfer-encodings
    knownCharsets.put("base64", UTF_8);
    knownCharsets.put("aes_rotating", UTF_8);
    knownCharsets.put("gzip", BINARY);

    // Content-types not covered by the text/*, +json, +xml rules in getCharset
    knownCharsets.put("application/xml", UTF_8);
    knownCharsets.put("application/json", UTF_8);
  }
}